statsample 0.6.5 → 0.6.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. data/History.txt +15 -0
  2. data/Manifest.txt +6 -0
  3. data/README.txt +30 -12
  4. data/Rakefile +91 -0
  5. data/demo/levene.rb +9 -0
  6. data/demo/multiple_regression.rb +1 -7
  7. data/demo/polychoric.rb +1 -0
  8. data/demo/principal_axis.rb +8 -0
  9. data/lib/distribution/f.rb +22 -22
  10. data/lib/spss.rb +99 -99
  11. data/lib/statsample/bivariate/polychoric.rb +32 -22
  12. data/lib/statsample/bivariate/tetrachoric.rb +212 -207
  13. data/lib/statsample/bivariate.rb +6 -6
  14. data/lib/statsample/codification.rb +65 -65
  15. data/lib/statsample/combination.rb +60 -59
  16. data/lib/statsample/converter/csv19.rb +12 -12
  17. data/lib/statsample/converters.rb +1 -1
  18. data/lib/statsample/dataset.rb +93 -36
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
  20. data/lib/statsample/dominanceanalysis.rb +5 -6
  21. data/lib/statsample/factor/pca.rb +41 -11
  22. data/lib/statsample/factor/principalaxis.rb +105 -29
  23. data/lib/statsample/factor/rotation.rb +20 -3
  24. data/lib/statsample/factor.rb +1 -1
  25. data/lib/statsample/graph/gdchart.rb +13 -13
  26. data/lib/statsample/graph/svggraph.rb +166 -167
  27. data/lib/statsample/matrix.rb +22 -12
  28. data/lib/statsample/mle/logit.rb +3 -2
  29. data/lib/statsample/mle/probit.rb +7 -5
  30. data/lib/statsample/mle.rb +4 -2
  31. data/lib/statsample/multiset.rb +125 -124
  32. data/lib/statsample/permutation.rb +2 -1
  33. data/lib/statsample/regression/binomial/logit.rb +4 -3
  34. data/lib/statsample/regression/binomial/probit.rb +2 -1
  35. data/lib/statsample/regression/binomial.rb +62 -81
  36. data/lib/statsample/regression/multiple/baseengine.rb +1 -1
  37. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  38. data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
  39. data/lib/statsample/regression/multiple.rb +15 -42
  40. data/lib/statsample/regression/simple.rb +93 -78
  41. data/lib/statsample/regression.rb +74 -2
  42. data/lib/statsample/reliability.rb +117 -120
  43. data/lib/statsample/srs.rb +156 -153
  44. data/lib/statsample/test/levene.rb +90 -0
  45. data/lib/statsample/test/umannwhitney.rb +25 -9
  46. data/lib/statsample/test.rb +2 -0
  47. data/lib/statsample/vector.rb +388 -413
  48. data/lib/statsample.rb +74 -30
  49. data/po/es/statsample.mo +0 -0
  50. data/test/test_bivariate.rb +5 -4
  51. data/test/test_combination.rb +1 -1
  52. data/test/test_dataset.rb +2 -2
  53. data/test/test_factor.rb +53 -6
  54. data/test/test_gsl.rb +1 -1
  55. data/test/test_mle.rb +1 -1
  56. data/test/test_regression.rb +18 -33
  57. data/test/test_statistics.rb +15 -33
  58. data/test/test_stest.rb +35 -0
  59. data/test/test_svg_graph.rb +2 -2
  60. data/test/test_vector.rb +331 -333
  61. metadata +38 -11
@@ -1,22 +1,23 @@
1
1
  require 'statsample/vector'
2
2
 
3
3
  class Hash
4
+ # Creates a Statsample::Dataset based on a Hash
4
5
  def to_dataset(*args)
5
6
  Statsample::Dataset.new(self,*args)
6
7
  end
7
8
  end
8
9
 
9
10
  class Array
10
- def prefix(s)
11
+ def prefix(s) # :nodoc:
11
12
  self.collect{|c| s+c.to_s }
12
13
  end
13
- def suffix(s)
14
+ def suffix(s) # :nodoc:
14
15
  self.collect{|c| c.to_s+s }
15
16
  end
16
17
  end
17
18
 
18
19
  module Statsample
19
- class DatasetException < RuntimeError
20
+ class DatasetException < RuntimeError # :nodoc:
20
21
  attr_reader :ds,:exp
21
22
  def initialize(ds,e)
22
23
  @ds=ds
@@ -28,15 +29,49 @@ module Statsample
28
29
  m
29
30
  end
30
31
  end
32
+ # Set of cases with values for one or more variables,
33
+ # analogous to a data frame in R or a standard SPSS data file.
34
+ # Every vector has a <tt>#field</tt> name, which identifies it. By default,
35
+ # the vectors are ordered by their field names, but you can change
36
+ # the field order manually.
37
+ # The Dataset works as a Hash, whose keys are field names
38
+ # and whose values are Statsample::Vector objects
39
+ #
40
+ #
41
+ # ==Usage
42
+ # Create an empty dataset
43
+ # Dataset.new()
44
+ # Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>
45
+ # Dataset.new(%w{v1 v2 v3})
46
+ # Create a dataset with two vectors
47
+ # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
48
+ # Create a dataset with two given vectors (v1 and v2), with vectors on inverted order
49
+ # Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
50
+ #
51
+ # The fast way to create a dataset uses Hash#to_dataset, with
52
+ # field order as arguments
53
+ # v1 = [1,2,3].to_scale
54
+ # v2 = [1,2,3].to_scale
55
+ # ds = {'v1'=>v1, 'v2'=>v2}.to_dataset(%w{v2 v1})
56
+
31
57
  class Dataset
32
58
  include Writable
33
- attr_reader :vectors, :fields, :cases, :i
59
+ # Hash of Statsample::Vector
60
+ attr_reader :vectors
61
+ # Ordered names of vectors
62
+ attr_reader :fields
63
+ # Number of cases
64
+ attr_reader :cases
65
+ # Location of pointer on enumerations methods (like #each)
66
+ attr_reader :i
67
+ # Deprecated: Label of vectors
34
68
  attr_accessor :labels
35
69
 
36
70
  # Generates a new dataset, using three vectors
37
71
  # - Rows
38
72
  # - Columns
39
73
  # - Values
74
+ #
40
75
  # For example, you have these values
41
76
  #
42
77
  # x y v
@@ -88,16 +123,7 @@ module Statsample
88
123
  # order of variables. If empty, the vector keys in alphabetical order are
89
124
  # used as fields
90
125
  # [labels] Hash to set names for fields.
91
- #
92
- #
93
- # Dataset.new()
94
- # Dataset.new(%w{v1 v2 v3})
95
- # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
96
- # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
97
- #
98
- # The fast way to create a dataset uses Hash#to_dataset, with
99
- # fields and labels as arguments
100
- # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
126
+
101
127
  #
102
128
  def initialize(vectors={}, fields=[], labels={})
103
129
  if vectors.instance_of? Array
@@ -120,7 +146,8 @@ module Statsample
120
146
  end
121
147
  matrix
122
148
  end
123
- def label(v_id)
149
+ # Retrieves label for a vector, giving a field name.
150
+ def label(v_id)
124
151
  raise "Vector #{v} doesn't exists" unless @fields.include? v_id
125
152
  @labels[v_id].nil? ? v_id : @labels[v_id]
126
153
  end
@@ -233,12 +260,20 @@ module Statsample
233
260
  ds_boot.update_valid_data
234
261
  ds_boot
235
262
  end
236
- # Fast version of add case
263
+ # Fast version of #add_case.
237
264
  # Can only add one case, and no error check is performed
238
- # You SHOULD use update_valid_data at the end of insertion cycle
265
+ # You SHOULD use #update_valid_data at the end of insertion cycle
239
266
  def add_case_array(v)
240
267
  v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
241
268
  end
269
+ # Insert a case, using:
270
+ # * Array: size equal to number of vectors and values in the same order as fields
271
+ # * Hash: keys equal to fields
272
+ # If uvd is false, #update_valid_data is not executed after
273
+ # inserting a case. This is very useful if you want to increase the
274
+ # performance on inserting many cases,
275
+ # because #update_valid_data performs check on vectors and on the dataset
276
+
242
277
  def add_case(v,uvd=true)
243
278
  case v
244
279
  when Array
@@ -258,14 +293,18 @@ module Statsample
258
293
  update_valid_data
259
294
  end
260
295
  end
296
+ # Check vectors and fields after inserting data. Use only
297
+ # after #add_case_array or #add_case with second parameter to false
261
298
  def update_valid_data
262
299
  @fields.each{|f| @vectors[f].set_valid_data}
263
300
  check_length
264
301
  end
302
+ # Delete a vector
265
303
  def delete_vector(name)
266
304
  @fields.delete(name)
267
305
  @vectors.delete(name)
268
306
  end
307
+
269
308
  def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
270
309
  split=@vectors[name].split_by_separator(sep)
271
310
  i=1
@@ -294,7 +333,7 @@ module Statsample
294
333
  def vector_sum(fields=nil)
295
334
  a=[]
296
335
  fields||=@fields
297
- collect_with_index do |i,row|
336
+ collect_with_index do |row, i|
298
337
  if(fields.find{|f| !@vectors[f].data_with_nils[i]})
299
338
  nil
300
339
  else
@@ -302,16 +341,17 @@ module Statsample
302
341
  end
303
342
  end
304
343
  end
344
+ # Check if #fields attribute is correct, after inserting or deleting vectors
305
345
  def check_fields(fields)
306
346
  fields||=@fields
307
347
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
308
348
  fields
309
349
  end
350
+
310
351
  # Returns a vector with the numbers of missing values for a case
311
-
312
352
  def vector_missing_values(fields=nil)
313
353
  fields=check_fields(fields)
314
- collect_with_index do |i,row|
354
+ collect_with_index do |row, i|
315
355
  fields.inject(0) {|a,v|
316
356
  a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
317
357
  }
@@ -319,9 +359,8 @@ module Statsample
319
359
  end
320
360
  def vector_count_characters(fields=nil)
321
361
  fields=check_fields(fields)
322
- collect_with_index do |i,row|
362
+ collect_with_index do |row, i|
323
363
  fields.inject(0){|a,v|
324
-
325
364
  a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
326
365
  }
327
366
  end
@@ -353,7 +392,8 @@ module Statsample
353
392
  end
354
393
  a.to_vector(:scale)
355
394
  end
356
- def check_length
395
+ # Check vectors for type and size.
396
+ def check_length # :nodoc:
357
397
  size=nil
358
398
  @vectors.each do |k,v|
359
399
  raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
@@ -368,16 +408,19 @@ module Statsample
368
408
  end
369
409
  @cases=size
370
410
  end
371
- def each_vector
372
- @fields.each{|k| yield k,@vectors[k]}
411
+ # Retrieves each vector as [key, vector]
412
+ def each_vector # :yield: |key, vector|
413
+ @fields.each{|k| yield k, @vectors[k]}
373
414
  end
415
+
374
416
  if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
375
417
  def case_as_hash(c) # :nodoc:
376
418
  Statsample::STATSAMPLE__.case_as_hash(self,c)
377
419
  end
378
420
  else
379
- def case_as_hash(c)
380
- _case_as_hash(c)
421
+ # Retrieves case i as a hash
422
+ def case_as_hash(i)
423
+ _case_as_hash(i)
381
424
  end
382
425
  end
383
426
 
@@ -386,8 +429,9 @@ module Statsample
386
429
  Statsample::STATSAMPLE__.case_as_array(self,c)
387
430
  end
388
431
  else
389
- def case_as_array(c)
390
- _case_as_array(c)
432
+ # Retrieves case i as an array, in #fields order
433
+ def case_as_array(i)
434
+ _case_as_array(i)
391
435
  end
392
436
  end
393
437
  def _case_as_hash(c) # :nodoc:
@@ -396,6 +440,7 @@ module Statsample
396
440
  def _case_as_array(c) # :nodoc:
397
441
  @fields.collect {|x| @vectors[x][c]}
398
442
  end
443
+
399
444
  # Returns each case as a hash
400
445
  def each
401
446
  begin
@@ -411,7 +456,7 @@ module Statsample
411
456
  end
412
457
  end
413
458
  # Returns each case as hash and index
414
- def each_with_index
459
+ def each_with_index # :yield: |case, i|
415
460
  begin
416
461
  @i=0
417
462
  @cases.times{|i|
@@ -447,6 +492,7 @@ module Statsample
447
492
  }
448
493
  @i=nil
449
494
  end
495
+ # Set fields order. If you omit one or more vectors,
450
496
  def fields=(f)
451
497
  @fields=f
452
498
  check_order
@@ -470,6 +516,8 @@ module Statsample
470
516
  raise ArgumentError, "You need a String or a Range"
471
517
  end
472
518
  end
519
+ # Retrieves a Statsample::Vector, based on the result
520
+ # of calculation performed on each case.
473
521
  def collect(type=:scale)
474
522
  data=[]
475
523
  each {|row|
@@ -477,10 +525,11 @@ module Statsample
477
525
  }
478
526
  Statsample::Vector.new(data,type)
479
527
  end
528
+ # Same as #collect, but giving case index as second parameter on yield.
480
529
  def collect_with_index(type=:scale)
481
530
  data=[]
482
531
  each_with_index {|row, i|
483
- data.push(yield(i,row))
532
+ data.push(yield(row, i))
484
533
  }
485
534
  Statsample::Vector.new(data,type)
486
535
  end
@@ -504,6 +553,8 @@ module Statsample
504
553
  raise ArgumentError,"Should pass a Statsample::Vector"
505
554
  end
506
555
  end
556
+ # Return data as a matrix. Columns are ordered by #fields and
557
+ # rows by order of insertion
507
558
  def to_matrix
508
559
  rows=[]
509
560
  self.each_array{|c|
@@ -511,7 +562,8 @@ module Statsample
511
562
  }
512
563
  Matrix.rows(rows)
513
564
  end
514
- if HAS_GSL
565
+
566
+ if Statsample.has_gsl?
515
567
  def to_matrix_gsl
516
568
  rows=[]
517
569
  self.each_array{|c|
@@ -520,15 +572,17 @@ module Statsample
520
572
  GSL::Matrix.alloc(*rows)
521
573
  end
522
574
  end
523
- def to_multiset_by_split(*fields)
575
+
576
+ def to_multiset_by_split(*fields)
524
577
  require 'statsample/multiset'
525
578
  if fields.size==1
526
579
  to_multiset_by_split_one_field(fields[0])
527
580
  else
528
581
  to_multiset_by_split_multiple_fields(*fields)
529
582
  end
530
- end
531
- # create a new dataset with all the data which the block returns true
583
+ end
584
+
585
+ # Create a new dataset with all cases for which the block returns true
532
586
  def filter
533
587
  ds=self.dup_empty
534
588
  each {|c|
@@ -537,6 +591,7 @@ module Statsample
537
591
  ds.update_valid_data
538
592
  ds
539
593
  end
594
+
540
595
  # Creates a new vector with the data of a given field for which the block returns true
541
596
  def filter_field(field)
542
597
  a=[]
@@ -545,6 +600,7 @@ module Statsample
545
600
  }
546
601
  a.to_vector(@vectors[field].type)
547
602
  end
603
+
548
604
  def to_multiset_by_split_one_field(field)
549
605
  raise ArgumentError,"Should use a correct field name" if !@fields.include? field
550
606
  factors=@vectors[field].factors
@@ -604,7 +660,7 @@ module Statsample
604
660
  text.gsub!(f,"row['#{f}']")
605
661
  end
606
662
  }
607
- collect_with_index {|i,row|
663
+ collect_with_index {|row, i|
608
664
  invalid=false
609
665
  @fields.each{|f|
610
666
  if @vectors[f].data_with_nils[i].nil?
@@ -653,6 +709,7 @@ module Statsample
653
709
  end
654
710
  # Creates a new dataset for one to many relations
655
711
  # on a dataset, based on pattern of field names.
712
+ #
656
713
  # for example, you have a survey for number of children
657
714
  # with this structure:
658
715
  # id, name, child_name_1, child_age_1, child_name_2, child_age_2
@@ -1,8 +1,70 @@
1
1
  module Statsample
2
2
  class DominanceAnalysis
3
+ # == Goal
3
4
  # Generates bootstrap samples to identify the replicability of a Dominance Analysis. See Azen & Budescu (2003) for more information.
4
- # References:
5
- # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. _Psychological Methods, 8_(2), 129-148.
5
+ #
6
+ # == Usage
7
+ #
8
+ # require 'statsample'
9
+ # a=100.times.collect {rand}.to_scale
10
+ # b=100.times.collect {rand}.to_scale
11
+ # c=100.times.collect {rand}.to_scale
12
+ # d=100.times.collect {rand}.to_scale
13
+ # ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
14
+ # ds['y']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
15
+ # dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, 'y', :debug=>true)
16
+ # dab.bootstrap(100,nil)
17
+ # puts dab.summary
18
+ # <strong>Output</strong>
19
+ # Sample size: 100
20
+ # t: 1.98421693632958
21
+ #
22
+ # Linear Regression Engine: Statsample::Regression::Multiple::MatrixEngine
23
+ # Table: Bootstrap report
24
+ # --------------------------------------------------------------------------------------------
25
+ # | pairs | sD | Dij | SE(Dij) | Pij | Pji | Pno | Reproducibility |
26
+ # --------------------------------------------------------------------------------------------
27
+ # | Complete dominance |
28
+ # --------------------------------------------------------------------------------------------
29
+ # | a - b | 1.0 | 0.6150 | 0.454 | 0.550 | 0.320 | 0.130 | 0.550 |
30
+ # | a - c | 1.0 | 0.9550 | 0.175 | 0.930 | 0.020 | 0.050 | 0.930 |
31
+ # | a - d | 1.0 | 0.9750 | 0.131 | 0.960 | 0.010 | 0.030 | 0.960 |
32
+ # | b - c | 1.0 | 0.8800 | 0.276 | 0.820 | 0.060 | 0.120 | 0.820 |
33
+ # | b - d | 1.0 | 0.9250 | 0.193 | 0.860 | 0.010 | 0.130 | 0.860 |
34
+ # | c - d | 0.5 | 0.5950 | 0.346 | 0.350 | 0.160 | 0.490 | 0.490 |
35
+ # --------------------------------------------------------------------------------------------
36
+ # | Conditional dominance |
37
+ # --------------------------------------------------------------------------------------------
38
+ # | a - b | 1.0 | 0.6300 | 0.458 | 0.580 | 0.320 | 0.100 | 0.580 |
39
+ # | a - c | 1.0 | 0.9700 | 0.156 | 0.960 | 0.020 | 0.020 | 0.960 |
40
+ # | a - d | 1.0 | 0.9800 | 0.121 | 0.970 | 0.010 | 0.020 | 0.970 |
41
+ # | b - c | 1.0 | 0.8850 | 0.283 | 0.840 | 0.070 | 0.090 | 0.840 |
42
+ # | b - d | 1.0 | 0.9500 | 0.181 | 0.920 | 0.020 | 0.060 | 0.920 |
43
+ # | c - d | 0.5 | 0.5800 | 0.360 | 0.350 | 0.190 | 0.460 | 0.460 |
44
+ # --------------------------------------------------------------------------------------------
45
+ # | General Dominance |
46
+ # --------------------------------------------------------------------------------------------
47
+ # | a - b | 1.0 | 0.6500 | 0.479 | 0.650 | 0.350 | 0.000 | 0.650 |
48
+ # | a - c | 1.0 | 0.9800 | 0.141 | 0.980 | 0.020 | 0.000 | 0.980 |
49
+ # | a - d | 1.0 | 0.9900 | 0.100 | 0.990 | 0.010 | 0.000 | 0.990 |
50
+ # | b - c | 1.0 | 0.9000 | 0.302 | 0.900 | 0.100 | 0.000 | 0.900 |
51
+ # | b - d | 1.0 | 0.9700 | 0.171 | 0.970 | 0.030 | 0.000 | 0.970 |
52
+ # | c - d | 1.0 | 0.5600 | 0.499 | 0.560 | 0.440 | 0.000 | 0.560 |
53
+ # --------------------------------------------------------------------------------------------
54
+ #
55
+ # Table: General averages
56
+ # ---------------------------------------
57
+ # | var | mean | se | p.5 | p.95 |
58
+ # ---------------------------------------
59
+ # | a | 0.133 | 0.049 | 0.062 | 0.218 |
60
+ # | b | 0.106 | 0.048 | 0.029 | 0.199 |
61
+ # | c | 0.035 | 0.032 | 0.002 | 0.106 |
62
+ # | d | 0.023 | 0.019 | 0.002 | 0.062 |
63
+ # ---------------------------------------
64
+ #
65
+ # == References:
66
+ #
67
+ # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
6
68
  class Bootstrap
7
69
  include GetText
8
70
  include Writable
@@ -27,12 +89,13 @@ module Statsample
27
89
  attr_accessor :alpha
28
90
  # Debug?
29
91
  attr_accessor :debug
92
+ # Default level of confidence for t calculation
93
+ ALPHA=0.95
30
94
  # Create a new Dominance Analysis Bootstrap Object
31
95
  #
32
96
  # * ds: A Dataset object
33
97
  # * y_var: Name of dependent variable
34
98
  # * opts: Any other attribute of the class
35
- ALPHA=0.95
36
99
  def initialize(ds,y_var, opts=Hash.new)
37
100
  @ds=ds
38
101
  @y_var=y_var
@@ -1,13 +1,12 @@
1
1
  require 'statsample/dominanceanalysis/bootstrap'
2
2
  module Statsample
3
- # Dominance Analysis is a procedure based on an examination of the R^2 values
3
+ # Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
4
4
  # for all possible subset models, to identify the relevance of one or more
5
5
  # predictors in the prediction of a criterion.
6
6
  #
7
- #
8
7
  # See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
9
8
  #
10
- # Example:
9
+ # == Use
11
10
  #
12
11
  # a=1000.times.collect {rand}.to_scale
13
12
  # b=1000.times.collect {rand}.to_scale
@@ -17,7 +16,7 @@ module Statsample
17
16
  # da=Statsample::DominanceAnalysis.new(ds,'y')
18
17
  # puts da.summary
19
18
  #
20
- # Output:
19
+ # === Output:
21
20
  #
22
21
  # Report: Report 2010-02-08 19:10:11 -0300
23
22
  # Table: Dominance Analysis result
@@ -51,12 +50,12 @@ module Statsample
51
50
  # | a - c | 1.0 | 1.0 | 1.0 |
52
51
  # | b - c | 1.0 | 1.0 | 1.0 |
53
52
  # -----------------------------------------
54
-
55
53
  #
56
54
  # == References:
57
55
  # * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
58
56
  # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
59
57
  # * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
58
+ #
60
59
  class DominanceAnalysis
61
60
  include GetText
62
61
  bindtextdomain("statsample")
@@ -366,7 +365,7 @@ module Statsample
366
365
  generator.parse_element(t)
367
366
  generator.add_html("</div>")
368
367
  end
369
- class ModelData
368
+ class ModelData # :nodoc:
370
369
  attr_reader :contributions
371
370
  def initialize(independent, data, da)
372
371
  @independent=independent
@@ -1,21 +1,42 @@
1
1
  module Statsample
2
2
  module Factor
3
- # Principal Component Analysis of a given covariance or correlation matrix.
4
- # For factorial Analysis, use Statsample::Factor::PrincipalAxis
5
- # Reference: SPSS manual
6
- # Use:
3
+ # Principal Component Analysis (PCA) of a
4
+ # covariance or correlation matrix.
5
+ #
6
+ # For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
7
+ #
8
+ # == Usage:
9
+ # require 'statsample'
7
10
  # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
8
11
  # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
9
12
  # ds={'a'=>a,'b'=>b}.to_dataset
10
13
  # cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
11
14
  # pca=Statsample::Factor::PCA.new(cor_matrix)
12
- # p pca.component_matrix
15
+ # pca.m
16
+ # => 1
17
+ # pca.eigenvalues
18
+ # => [1.92592927269225, 0.0740707273077545]
19
+ # pca.component_matrix
20
+ # => GSL::Matrix
21
+ # [ 9.813e-01
22
+ # 9.813e-01 ]
23
+ # pca.communalities
24
+ # => [0.962964636346122, 0.962964636346122]
25
+ #
26
+ # == References:
27
+ #
28
+ # * SPSS manual
29
+ # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
30
+ #
13
31
  class PCA
14
- attr_accessor :name, :m
32
+ # Name of analysis
33
+ attr_accessor :name
34
+ # Number of factors. Set by default to the number of factors
35
+ # with eigenvalues > 1
36
+ attr_accessor :m
15
37
  include GetText
16
38
  bindtextdomain("statsample")
17
39
 
18
-
19
40
  def initialize(matrix ,opts=Hash.new)
20
41
  if matrix.respond_to? :to_gsl
21
42
  matrix=matrix.to_gsl
@@ -42,6 +63,7 @@ module Factor
42
63
  }
43
64
  @ds=h.to_dataset
44
65
  end
66
+
45
67
  # Feature vector for m factors
46
68
  def feature_vector(m=nil)
47
69
  m||=@m
@@ -69,10 +91,10 @@ module Factor
69
91
  gammas.push(Math::sqrt(@eigenpairs[i][0]))
70
92
  }
71
93
  gamma_m=GSL::Matrix.diagonal(gammas)
72
- omega_m*(gamma_m)
94
+ (omega_m*(gamma_m)).to_matrix
73
95
  end
74
- # Communality for all variables given m factors
75
- def communality(m=nil)
96
+ # Communalities for all variables given m factors
97
+ def communalities(m=nil)
76
98
  m||=@m
77
99
  h=[]
78
100
  @n_variables.times do |i|
@@ -84,9 +106,11 @@ module Factor
84
106
  end
85
107
  h
86
108
  end
109
+ # Array with eigenvalues
87
110
  def eigenvalues
88
111
  @eigenpairs.collect {|c| c[0] }
89
112
  end
113
+
90
114
  def calculate_eigenpairs
91
115
  eigval, eigvec= GSL::Eigen.symmv(@matrix)
92
116
  @eigenpairs={}
@@ -95,13 +119,18 @@ module Factor
95
119
  }
96
120
  @eigenpairs=@eigenpairs.sort.reverse
97
121
  end
122
+ def summary
123
+ rp=ReportBuilder.new()
124
+ rp.add(self)
125
+ rp.to_text
126
+ end
98
127
  def to_reportbuilder(generator) # :nodoc:
99
128
  anchor=generator.add_toc_entry(_("PCA: ")+name)
100
129
  generator.add_html "<div class='pca'>"+_("PCA")+" #{@name}<a name='#{anchor}'></a>"
101
130
 
102
131
  generator.add_text "Number of factors: #{m}"
103
132
  t=ReportBuilder::Table.new(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"])
104
- communality(m).each_with_index {|com,i|
133
+ communalities(m).each_with_index {|com,i|
105
134
  t.add_row([i, 1.0, sprintf("%0.3f", com)])
106
135
  }
107
136
  generator.parse_element(t)
@@ -122,6 +151,7 @@ module Factor
122
151
  generator.parse_element(t)
123
152
  generator.add_html("</div>")
124
153
  end
154
+ private :calculate_eigenpairs, :create_centered_ds
125
155
  end
126
156
  end
127
157
  end