statsample 0.6.5 → 0.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/Manifest.txt +6 -0
- data/README.txt +30 -12
- data/Rakefile +91 -0
- data/demo/levene.rb +9 -0
- data/demo/multiple_regression.rb +1 -7
- data/demo/polychoric.rb +1 -0
- data/demo/principal_axis.rb +8 -0
- data/lib/distribution/f.rb +22 -22
- data/lib/spss.rb +99 -99
- data/lib/statsample/bivariate/polychoric.rb +32 -22
- data/lib/statsample/bivariate/tetrachoric.rb +212 -207
- data/lib/statsample/bivariate.rb +6 -6
- data/lib/statsample/codification.rb +65 -65
- data/lib/statsample/combination.rb +60 -59
- data/lib/statsample/converter/csv19.rb +12 -12
- data/lib/statsample/converters.rb +1 -1
- data/lib/statsample/dataset.rb +93 -36
- data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
- data/lib/statsample/dominanceanalysis.rb +5 -6
- data/lib/statsample/factor/pca.rb +41 -11
- data/lib/statsample/factor/principalaxis.rb +105 -29
- data/lib/statsample/factor/rotation.rb +20 -3
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/graph/gdchart.rb +13 -13
- data/lib/statsample/graph/svggraph.rb +166 -167
- data/lib/statsample/matrix.rb +22 -12
- data/lib/statsample/mle/logit.rb +3 -2
- data/lib/statsample/mle/probit.rb +7 -5
- data/lib/statsample/mle.rb +4 -2
- data/lib/statsample/multiset.rb +125 -124
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression/binomial/logit.rb +4 -3
- data/lib/statsample/regression/binomial/probit.rb +2 -1
- data/lib/statsample/regression/binomial.rb +62 -81
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
- data/lib/statsample/regression/multiple.rb +15 -42
- data/lib/statsample/regression/simple.rb +93 -78
- data/lib/statsample/regression.rb +74 -2
- data/lib/statsample/reliability.rb +117 -120
- data/lib/statsample/srs.rb +156 -153
- data/lib/statsample/test/levene.rb +90 -0
- data/lib/statsample/test/umannwhitney.rb +25 -9
- data/lib/statsample/test.rb +2 -0
- data/lib/statsample/vector.rb +388 -413
- data/lib/statsample.rb +74 -30
- data/po/es/statsample.mo +0 -0
- data/test/test_bivariate.rb +5 -4
- data/test/test_combination.rb +1 -1
- data/test/test_dataset.rb +2 -2
- data/test/test_factor.rb +53 -6
- data/test/test_gsl.rb +1 -1
- data/test/test_mle.rb +1 -1
- data/test/test_regression.rb +18 -33
- data/test/test_statistics.rb +15 -33
- data/test/test_stest.rb +35 -0
- data/test/test_svg_graph.rb +2 -2
- data/test/test_vector.rb +331 -333
- metadata +38 -11
data/lib/statsample/dataset.rb
CHANGED
@@ -1,22 +1,23 @@
|
|
1
1
|
require 'statsample/vector'
|
2
2
|
|
3
3
|
class Hash
|
4
|
+
# Creates a Statsample::Dataset based on a Hash
|
4
5
|
def to_dataset(*args)
|
5
6
|
Statsample::Dataset.new(self,*args)
|
6
7
|
end
|
7
8
|
end
|
8
9
|
|
9
10
|
class Array
|
10
|
-
def prefix(s)
|
11
|
+
def prefix(s) # :nodoc:
|
11
12
|
self.collect{|c| s+c.to_s }
|
12
13
|
end
|
13
|
-
def suffix(s)
|
14
|
+
def suffix(s) # :nodoc:
|
14
15
|
self.collect{|c| c.to_s+s }
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
18
19
|
module Statsample
|
19
|
-
class DatasetException < RuntimeError
|
20
|
+
class DatasetException < RuntimeError # :nodoc:
|
20
21
|
attr_reader :ds,:exp
|
21
22
|
def initialize(ds,e)
|
22
23
|
@ds=ds
|
@@ -28,15 +29,49 @@ module Statsample
|
|
28
29
|
m
|
29
30
|
end
|
30
31
|
end
|
32
|
+
# Set of cases with values for one or more variables,
|
33
|
+
# analog to a dataframe on R or a standard data file of SPSS.
|
34
|
+
# Every vector has <tt>#field</tt> name, which represent it. By default,
|
35
|
+
# the vectors are ordered by their field names, but you can change
|
36
|
+
# the fields order manually.
|
37
|
+
# The Dataset works as a Hash, where keys are field names
|
38
|
+
# and values are Statsample::Vector
|
39
|
+
#
|
40
|
+
#
|
41
|
+
# ==Usage
|
42
|
+
# Create an empty dataset
|
43
|
+
# Dataset.new()
|
44
|
+
# Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>
|
45
|
+
# Dataset.new(%w{v1 v2 v3})
|
46
|
+
# Create a dataset with two vectors
|
47
|
+
# Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
|
48
|
+
# Create a dataset with two given vectors (v1 and v2), with vectors on inverted order
|
49
|
+
# Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
|
50
|
+
#
|
51
|
+
# The fast way to create a dataset uses Hash#to_dataset, with
|
52
|
+
# field order as arguments
|
53
|
+
# v1 = [1,2,3].to_scale
|
54
|
+
# v2 = [1,2,3].to_scale
|
55
|
+
# ds = {'v1'=>v1, 'v2'=>v2}.to_dataset(%w{v2 v1})
|
56
|
+
|
31
57
|
class Dataset
|
32
58
|
include Writable
|
33
|
-
|
59
|
+
# Hash of Statsample::Vector
|
60
|
+
attr_reader :vectors
|
61
|
+
# Ordered names of vectors
|
62
|
+
attr_reader :fields
|
63
|
+
# Number of cases
|
64
|
+
attr_reader :cases
|
65
|
+
# Location of pointer on enumerations methods (like #each)
|
66
|
+
attr_reader :i
|
67
|
+
# Deprecated: Label of vectors
|
34
68
|
attr_accessor :labels
|
35
69
|
|
36
70
|
# Generates a new dataset, using three vectors
|
37
71
|
# - Rows
|
38
72
|
# - Columns
|
39
73
|
# - Values
|
74
|
+
#
|
40
75
|
# For example, you have these values
|
41
76
|
#
|
42
77
|
# x y v
|
@@ -88,16 +123,7 @@ module Statsample
|
|
88
123
|
# order of variables. If empty, vector keys in alphabetical order are
|
89
124
|
# used as fields
|
90
125
|
# [labels] Hash to set names for fields.
|
91
|
-
|
92
|
-
#
|
93
|
-
# Dataset.new()
|
94
|
-
# Dataset.new(%w{v1 v2 v3})
|
95
|
-
# Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
|
96
|
-
# Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
|
97
|
-
#
|
98
|
-
# The fast way to create a dataset uses Hash#to_dataset, with
|
99
|
-
# fields and labels as arguments
|
100
|
-
# ds = {'v1'=>[1,2,3].to_vector}.to_dataset
|
126
|
+
|
101
127
|
#
|
102
128
|
def initialize(vectors={}, fields=[], labels={})
|
103
129
|
if vectors.instance_of? Array
|
@@ -120,7 +146,8 @@ module Statsample
|
|
120
146
|
end
|
121
147
|
matrix
|
122
148
|
end
|
123
|
-
|
149
|
+
# Retrieves label for a vector, giving a field name.
|
150
|
+
def label(v_id)
|
124
151
|
raise "Vector #{v} doesn't exists" unless @fields.include? v_id
|
125
152
|
@labels[v_id].nil? ? v_id : @labels[v_id]
|
126
153
|
end
|
@@ -233,12 +260,20 @@ module Statsample
|
|
233
260
|
ds_boot.update_valid_data
|
234
261
|
ds_boot
|
235
262
|
end
|
236
|
-
# Fast version of
|
263
|
+
# Fast version of #add_case.
|
237
264
|
# Can only add one case and no error check is performed
|
238
|
-
# You SHOULD use update_valid_data at the end of insertion cycle
|
265
|
+
# You SHOULD use #update_valid_data at the end of insertion cycle
|
239
266
|
def add_case_array(v)
|
240
267
|
v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
|
241
268
|
end
|
269
|
+
# Insert a case, using:
|
270
|
+
# * Array: size equal to number of vectors and values in the same order as fields
|
271
|
+
# * Hash: keys equal to fields
|
272
|
+
# If uvd is false, #update_valid_data is not executed after
|
273
|
+
# inserting a case. This is very useful if you want to increase the
|
274
|
+
# performance on inserting many cases,
|
275
|
+
# because #update_valid_data performs check on vectors and on the dataset
|
276
|
+
|
242
277
|
def add_case(v,uvd=true)
|
243
278
|
case v
|
244
279
|
when Array
|
@@ -258,14 +293,18 @@ module Statsample
|
|
258
293
|
update_valid_data
|
259
294
|
end
|
260
295
|
end
|
296
|
+
# Check vectors and fields after inserting data. Use only
|
297
|
+
# after #add_case_array or #add_case with second parameter to false
|
261
298
|
def update_valid_data
|
262
299
|
@fields.each{|f| @vectors[f].set_valid_data}
|
263
300
|
check_length
|
264
301
|
end
|
302
|
+
# Delete a vector
|
265
303
|
def delete_vector(name)
|
266
304
|
@fields.delete(name)
|
267
305
|
@vectors.delete(name)
|
268
306
|
end
|
307
|
+
|
269
308
|
def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
|
270
309
|
split=@vectors[name].split_by_separator(sep)
|
271
310
|
i=1
|
@@ -294,7 +333,7 @@ module Statsample
|
|
294
333
|
def vector_sum(fields=nil)
|
295
334
|
a=[]
|
296
335
|
fields||=@fields
|
297
|
-
collect_with_index do |i
|
336
|
+
collect_with_index do |row, i|
|
298
337
|
if(fields.find{|f| !@vectors[f].data_with_nils[i]})
|
299
338
|
nil
|
300
339
|
else
|
@@ -302,16 +341,17 @@ module Statsample
|
|
302
341
|
end
|
303
342
|
end
|
304
343
|
end
|
344
|
+
# Check if #fields attribute is correct, after inserting or deleting vectors
|
305
345
|
def check_fields(fields)
|
306
346
|
fields||=@fields
|
307
347
|
raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
|
308
348
|
fields
|
309
349
|
end
|
350
|
+
|
310
351
|
# Returns a vector with the number of missing values for each case
|
311
|
-
|
312
352
|
def vector_missing_values(fields=nil)
|
313
353
|
fields=check_fields(fields)
|
314
|
-
collect_with_index do |i
|
354
|
+
collect_with_index do |row, i|
|
315
355
|
fields.inject(0) {|a,v|
|
316
356
|
a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
|
317
357
|
}
|
@@ -319,9 +359,8 @@ module Statsample
|
|
319
359
|
end
|
320
360
|
def vector_count_characters(fields=nil)
|
321
361
|
fields=check_fields(fields)
|
322
|
-
collect_with_index do |i
|
362
|
+
collect_with_index do |row, i|
|
323
363
|
fields.inject(0){|a,v|
|
324
|
-
|
325
364
|
a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
|
326
365
|
}
|
327
366
|
end
|
@@ -353,7 +392,8 @@ module Statsample
|
|
353
392
|
end
|
354
393
|
a.to_vector(:scale)
|
355
394
|
end
|
356
|
-
|
395
|
+
# Check vectors for type and size.
|
396
|
+
def check_length # :nodoc:
|
357
397
|
size=nil
|
358
398
|
@vectors.each do |k,v|
|
359
399
|
raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
|
@@ -368,16 +408,19 @@ module Statsample
|
|
368
408
|
end
|
369
409
|
@cases=size
|
370
410
|
end
|
371
|
-
|
372
|
-
|
411
|
+
# Retrieves each vector as [key, vector]
|
412
|
+
def each_vector # :yield: |key, vector|
|
413
|
+
@fields.each{|k| yield k, @vectors[k]}
|
373
414
|
end
|
415
|
+
|
374
416
|
if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
|
375
417
|
def case_as_hash(c) # :nodoc:
|
376
418
|
Statsample::STATSAMPLE__.case_as_hash(self,c)
|
377
419
|
end
|
378
420
|
else
|
379
|
-
|
380
|
-
|
421
|
+
# Retrieves case i as a hash
|
422
|
+
def case_as_hash(i)
|
423
|
+
_case_as_hash(i)
|
381
424
|
end
|
382
425
|
end
|
383
426
|
|
@@ -386,8 +429,9 @@ module Statsample
|
|
386
429
|
Statsample::STATSAMPLE__.case_as_array(self,c)
|
387
430
|
end
|
388
431
|
else
|
389
|
-
|
390
|
-
|
432
|
+
# Retrieves case i as a array, ordered on #fields order
|
433
|
+
def case_as_array(i)
|
434
|
+
_case_as_array(i)
|
391
435
|
end
|
392
436
|
end
|
393
437
|
def _case_as_hash(c) # :nodoc:
|
@@ -396,6 +440,7 @@ module Statsample
|
|
396
440
|
def _case_as_array(c) # :nodoc:
|
397
441
|
@fields.collect {|x| @vectors[x][c]}
|
398
442
|
end
|
443
|
+
|
399
444
|
# Returns each case as a hash
|
400
445
|
def each
|
401
446
|
begin
|
@@ -411,7 +456,7 @@ module Statsample
|
|
411
456
|
end
|
412
457
|
end
|
413
458
|
# Returns each case as hash and index
|
414
|
-
def each_with_index
|
459
|
+
def each_with_index # :yield: |case, i|
|
415
460
|
begin
|
416
461
|
@i=0
|
417
462
|
@cases.times{|i|
|
@@ -447,6 +492,7 @@ module Statsample
|
|
447
492
|
}
|
448
493
|
@i=nil
|
449
494
|
end
|
495
|
+
# Set fields order. If you omit one or more vectors,
|
450
496
|
def fields=(f)
|
451
497
|
@fields=f
|
452
498
|
check_order
|
@@ -470,6 +516,8 @@ module Statsample
|
|
470
516
|
raise ArgumentError, "You need a String or a Range"
|
471
517
|
end
|
472
518
|
end
|
519
|
+
# Retrieves a Statsample::Vector, based on the result
|
520
|
+
# of calculation performed on each case.
|
473
521
|
def collect(type=:scale)
|
474
522
|
data=[]
|
475
523
|
each {|row|
|
@@ -477,10 +525,11 @@ module Statsample
|
|
477
525
|
}
|
478
526
|
Statsample::Vector.new(data,type)
|
479
527
|
end
|
528
|
+
# Same as #collect, but giving case index as second parameter on yield.
|
480
529
|
def collect_with_index(type=:scale)
|
481
530
|
data=[]
|
482
531
|
each_with_index {|row, i|
|
483
|
-
data.push(yield(i
|
532
|
+
data.push(yield(row, i))
|
484
533
|
}
|
485
534
|
Statsample::Vector.new(data,type)
|
486
535
|
end
|
@@ -504,6 +553,8 @@ module Statsample
|
|
504
553
|
raise ArgumentError,"Should pass a Statsample::Vector"
|
505
554
|
end
|
506
555
|
end
|
556
|
+
# Return data as a matrix. Columns are ordered by #fields and
|
557
|
+
# rows by order of insertion
|
507
558
|
def to_matrix
|
508
559
|
rows=[]
|
509
560
|
self.each_array{|c|
|
@@ -511,7 +562,8 @@ module Statsample
|
|
511
562
|
}
|
512
563
|
Matrix.rows(rows)
|
513
564
|
end
|
514
|
-
|
565
|
+
|
566
|
+
if Statsample.has_gsl?
|
515
567
|
def to_matrix_gsl
|
516
568
|
rows=[]
|
517
569
|
self.each_array{|c|
|
@@ -520,15 +572,17 @@ module Statsample
|
|
520
572
|
GSL::Matrix.alloc(*rows)
|
521
573
|
end
|
522
574
|
end
|
523
|
-
|
575
|
+
|
576
|
+
def to_multiset_by_split(*fields)
|
524
577
|
require 'statsample/multiset'
|
525
578
|
if fields.size==1
|
526
579
|
to_multiset_by_split_one_field(fields[0])
|
527
580
|
else
|
528
581
|
to_multiset_by_split_multiple_fields(*fields)
|
529
582
|
end
|
530
|
-
|
531
|
-
|
583
|
+
end
|
584
|
+
|
585
|
+
# Create a new dataset with all cases for which the block returns true
|
532
586
|
def filter
|
533
587
|
ds=self.dup_empty
|
534
588
|
each {|c|
|
@@ -537,6 +591,7 @@ module Statsample
|
|
537
591
|
ds.update_valid_data
|
538
592
|
ds
|
539
593
|
end
|
594
|
+
|
540
595
|
# Creates a new vector with the data of a given field for the cases where the block returns true
|
541
596
|
def filter_field(field)
|
542
597
|
a=[]
|
@@ -545,6 +600,7 @@ module Statsample
|
|
545
600
|
}
|
546
601
|
a.to_vector(@vectors[field].type)
|
547
602
|
end
|
603
|
+
|
548
604
|
def to_multiset_by_split_one_field(field)
|
549
605
|
raise ArgumentError,"Should use a correct field name" if !@fields.include? field
|
550
606
|
factors=@vectors[field].factors
|
@@ -604,7 +660,7 @@ module Statsample
|
|
604
660
|
text.gsub!(f,"row['#{f}']")
|
605
661
|
end
|
606
662
|
}
|
607
|
-
collect_with_index {|i
|
663
|
+
collect_with_index {|row, i|
|
608
664
|
invalid=false
|
609
665
|
@fields.each{|f|
|
610
666
|
if @vectors[f].data_with_nils[i].nil?
|
@@ -653,6 +709,7 @@ module Statsample
|
|
653
709
|
end
|
654
710
|
# Creates a new dataset for one to many relations
|
655
711
|
# on a dataset, based on pattern of field names.
|
712
|
+
#
|
656
713
|
# for example, you have a survey for number of children
|
657
714
|
# with this structure:
|
658
715
|
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
@@ -1,8 +1,70 @@
|
|
1
1
|
module Statsample
|
2
2
|
class DominanceAnalysis
|
3
|
+
# == Goal
|
3
4
|
# Generates Bootstrap samples to identify the replicability of a Dominance Analysis. See Azen & Budescu (2003) for more information.
|
4
|
-
#
|
5
|
-
#
|
5
|
+
#
|
6
|
+
# == Usage
|
7
|
+
#
|
8
|
+
# require 'statsample'
|
9
|
+
# a=100.times.collect {rand}.to_scale
|
10
|
+
# b=100.times.collect {rand}.to_scale
|
11
|
+
# c=100.times.collect {rand}.to_scale
|
12
|
+
# d=100.times.collect {rand}.to_scale
|
13
|
+
# ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
|
14
|
+
# ds['y']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
|
15
|
+
# dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, 'y', :debug=>true)
|
16
|
+
# dab.bootstrap(100,nil)
|
17
|
+
# puts dab.summary
|
18
|
+
# <strong>Output</strong>
|
19
|
+
# Sample size: 100
|
20
|
+
# t: 1.98421693632958
|
21
|
+
#
|
22
|
+
# Linear Regression Engine: Statsample::Regression::Multiple::MatrixEngine
|
23
|
+
# Table: Bootstrap report
|
24
|
+
# --------------------------------------------------------------------------------------------
|
25
|
+
# | pairs | sD | Dij | SE(Dij) | Pij | Pji | Pno | Reproducibility |
|
26
|
+
# --------------------------------------------------------------------------------------------
|
27
|
+
# | Complete dominance |
|
28
|
+
# --------------------------------------------------------------------------------------------
|
29
|
+
# | a - b | 1.0 | 0.6150 | 0.454 | 0.550 | 0.320 | 0.130 | 0.550 |
|
30
|
+
# | a - c | 1.0 | 0.9550 | 0.175 | 0.930 | 0.020 | 0.050 | 0.930 |
|
31
|
+
# | a - d | 1.0 | 0.9750 | 0.131 | 0.960 | 0.010 | 0.030 | 0.960 |
|
32
|
+
# | b - c | 1.0 | 0.8800 | 0.276 | 0.820 | 0.060 | 0.120 | 0.820 |
|
33
|
+
# | b - d | 1.0 | 0.9250 | 0.193 | 0.860 | 0.010 | 0.130 | 0.860 |
|
34
|
+
# | c - d | 0.5 | 0.5950 | 0.346 | 0.350 | 0.160 | 0.490 | 0.490 |
|
35
|
+
# --------------------------------------------------------------------------------------------
|
36
|
+
# | Conditional dominance |
|
37
|
+
# --------------------------------------------------------------------------------------------
|
38
|
+
# | a - b | 1.0 | 0.6300 | 0.458 | 0.580 | 0.320 | 0.100 | 0.580 |
|
39
|
+
# | a - c | 1.0 | 0.9700 | 0.156 | 0.960 | 0.020 | 0.020 | 0.960 |
|
40
|
+
# | a - d | 1.0 | 0.9800 | 0.121 | 0.970 | 0.010 | 0.020 | 0.970 |
|
41
|
+
# | b - c | 1.0 | 0.8850 | 0.283 | 0.840 | 0.070 | 0.090 | 0.840 |
|
42
|
+
# | b - d | 1.0 | 0.9500 | 0.181 | 0.920 | 0.020 | 0.060 | 0.920 |
|
43
|
+
# | c - d | 0.5 | 0.5800 | 0.360 | 0.350 | 0.190 | 0.460 | 0.460 |
|
44
|
+
# --------------------------------------------------------------------------------------------
|
45
|
+
# | General Dominance |
|
46
|
+
# --------------------------------------------------------------------------------------------
|
47
|
+
# | a - b | 1.0 | 0.6500 | 0.479 | 0.650 | 0.350 | 0.000 | 0.650 |
|
48
|
+
# | a - c | 1.0 | 0.9800 | 0.141 | 0.980 | 0.020 | 0.000 | 0.980 |
|
49
|
+
# | a - d | 1.0 | 0.9900 | 0.100 | 0.990 | 0.010 | 0.000 | 0.990 |
|
50
|
+
# | b - c | 1.0 | 0.9000 | 0.302 | 0.900 | 0.100 | 0.000 | 0.900 |
|
51
|
+
# | b - d | 1.0 | 0.9700 | 0.171 | 0.970 | 0.030 | 0.000 | 0.970 |
|
52
|
+
# | c - d | 1.0 | 0.5600 | 0.499 | 0.560 | 0.440 | 0.000 | 0.560 |
|
53
|
+
# --------------------------------------------------------------------------------------------
|
54
|
+
#
|
55
|
+
# Table: General averages
|
56
|
+
# ---------------------------------------
|
57
|
+
# | var | mean | se | p.5 | p.95 |
|
58
|
+
# ---------------------------------------
|
59
|
+
# | a | 0.133 | 0.049 | 0.062 | 0.218 |
|
60
|
+
# | b | 0.106 | 0.048 | 0.029 | 0.199 |
|
61
|
+
# | c | 0.035 | 0.032 | 0.002 | 0.106 |
|
62
|
+
# | d | 0.023 | 0.019 | 0.002 | 0.062 |
|
63
|
+
# ---------------------------------------
|
64
|
+
#
|
65
|
+
# == References:
|
66
|
+
#
|
67
|
+
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
|
6
68
|
class Bootstrap
|
7
69
|
include GetText
|
8
70
|
include Writable
|
@@ -27,12 +89,13 @@ module Statsample
|
|
27
89
|
attr_accessor :alpha
|
28
90
|
# Debug?
|
29
91
|
attr_accessor :debug
|
92
|
+
# Default level of confidence for t calculation
|
93
|
+
ALPHA=0.95
|
30
94
|
# Create a new Dominance Analysis Bootstrap Object
|
31
95
|
#
|
32
96
|
# * ds: A Dataset object
|
33
97
|
# * y_var: Name of dependent variable
|
34
98
|
# * opts: Any other attribute of the class
|
35
|
-
ALPHA=0.95
|
36
99
|
def initialize(ds,y_var, opts=Hash.new)
|
37
100
|
@ds=ds
|
38
101
|
@y_var=y_var
|
@@ -1,13 +1,12 @@
|
|
1
1
|
require 'statsample/dominanceanalysis/bootstrap'
|
2
2
|
module Statsample
|
3
|
-
# Dominance Analysis is a procedure based on an examination of the R
|
3
|
+
# Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
|
4
4
|
# for all possible subset models, to identify the relevance of one or more
|
5
5
|
# predictors in the prediction of the criterion.
|
6
6
|
#
|
7
|
-
#
|
8
7
|
# See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
|
9
8
|
#
|
10
|
-
#
|
9
|
+
# == Use
|
11
10
|
#
|
12
11
|
# a=1000.times.collect {rand}.to_scale
|
13
12
|
# b=1000.times.collect {rand}.to_scale
|
@@ -17,7 +16,7 @@ module Statsample
|
|
17
16
|
# da=Statsample::DominanceAnalysis.new(ds,'y')
|
18
17
|
# puts da.summary
|
19
18
|
#
|
20
|
-
# Output:
|
19
|
+
# === Output:
|
21
20
|
#
|
22
21
|
# Report: Report 2010-02-08 19:10:11 -0300
|
23
22
|
# Table: Dominance Analysis result
|
@@ -51,12 +50,12 @@ module Statsample
|
|
51
50
|
# | a - c | 1.0 | 1.0 | 1.0 |
|
52
51
|
# | b - c | 1.0 | 1.0 | 1.0 |
|
53
52
|
# -----------------------------------------
|
54
|
-
|
55
53
|
#
|
56
54
|
# == References:
|
57
55
|
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
|
58
56
|
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
|
59
57
|
# * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
|
58
|
+
#
|
60
59
|
class DominanceAnalysis
|
61
60
|
include GetText
|
62
61
|
bindtextdomain("statsample")
|
@@ -366,7 +365,7 @@ module Statsample
|
|
366
365
|
generator.parse_element(t)
|
367
366
|
generator.add_html("</div>")
|
368
367
|
end
|
369
|
-
class ModelData
|
368
|
+
class ModelData # :nodoc:
|
370
369
|
attr_reader :contributions
|
371
370
|
def initialize(independent, data, da)
|
372
371
|
@independent=independent
|
@@ -1,21 +1,42 @@
|
|
1
1
|
module Statsample
|
2
2
|
module Factor
|
3
|
-
# Principal Component Analysis of a
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
3
|
+
# Principal Component Analysis (PCA) of a
|
4
|
+
# covariance or correlation matrix.
|
5
|
+
#
|
6
|
+
# For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
|
7
|
+
#
|
8
|
+
# == Usage:
|
9
|
+
# require 'statsample'
|
7
10
|
# a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
|
8
11
|
# b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
|
9
12
|
# ds={'a'=>a,'b'=>b}.to_dataset
|
10
13
|
# cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
|
11
14
|
# pca=Statsample::Factor::PCA.new(cor_matrix)
|
12
|
-
#
|
15
|
+
# pca.m
|
16
|
+
# => 1
|
17
|
+
# pca.eigenvalues
|
18
|
+
# => [1.92592927269225, 0.0740707273077545]
|
19
|
+
# pca.component_matrix
|
20
|
+
# => GSL::Matrix
|
21
|
+
# [ 9.813e-01
|
22
|
+
# 9.813e-01 ]
|
23
|
+
# pca.communalities
|
24
|
+
# => [0.962964636346122, 0.962964636346122]
|
25
|
+
#
|
26
|
+
# == References:
|
27
|
+
#
|
28
|
+
# * SPSS manual
|
29
|
+
# * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
|
30
|
+
#
|
13
31
|
class PCA
|
14
|
-
|
32
|
+
# Name of analysis
|
33
|
+
attr_accessor :name
|
34
|
+
# Number of factors. Set by default to the number of factors
|
35
|
+
# with eigenvalues > 1
|
36
|
+
attr_accessor :m
|
15
37
|
include GetText
|
16
38
|
bindtextdomain("statsample")
|
17
39
|
|
18
|
-
|
19
40
|
def initialize(matrix ,opts=Hash.new)
|
20
41
|
if matrix.respond_to? :to_gsl
|
21
42
|
matrix=matrix.to_gsl
|
@@ -42,6 +63,7 @@ module Factor
|
|
42
63
|
}
|
43
64
|
@ds=h.to_dataset
|
44
65
|
end
|
66
|
+
|
45
67
|
# Feature vector for m factors
|
46
68
|
def feature_vector(m=nil)
|
47
69
|
m||=@m
|
@@ -69,10 +91,10 @@ module Factor
|
|
69
91
|
gammas.push(Math::sqrt(@eigenpairs[i][0]))
|
70
92
|
}
|
71
93
|
gamma_m=GSL::Matrix.diagonal(gammas)
|
72
|
-
omega_m*(gamma_m)
|
94
|
+
(omega_m*(gamma_m)).to_matrix
|
73
95
|
end
|
74
|
-
#
|
75
|
-
def
|
96
|
+
# Communalities for all variables given m factors
|
97
|
+
def communalities(m=nil)
|
76
98
|
m||=@m
|
77
99
|
h=[]
|
78
100
|
@n_variables.times do |i|
|
@@ -84,9 +106,11 @@ module Factor
|
|
84
106
|
end
|
85
107
|
h
|
86
108
|
end
|
109
|
+
# Array with eigenvalues
|
87
110
|
def eigenvalues
|
88
111
|
@eigenpairs.collect {|c| c[0] }
|
89
112
|
end
|
113
|
+
|
90
114
|
def calculate_eigenpairs
|
91
115
|
eigval, eigvec= GSL::Eigen.symmv(@matrix)
|
92
116
|
@eigenpairs={}
|
@@ -95,13 +119,18 @@ module Factor
|
|
95
119
|
}
|
96
120
|
@eigenpairs=@eigenpairs.sort.reverse
|
97
121
|
end
|
122
|
+
def summary
|
123
|
+
rp=ReportBuilder.new()
|
124
|
+
rp.add(self)
|
125
|
+
rp.to_text
|
126
|
+
end
|
98
127
|
def to_reportbuilder(generator) # :nodoc:
|
99
128
|
anchor=generator.add_toc_entry(_("PCA: ")+name)
|
100
129
|
generator.add_html "<div class='pca'>"+_("PCA")+" #{@name}<a name='#{anchor}'></a>"
|
101
130
|
|
102
131
|
generator.add_text "Number of factors: #{m}"
|
103
132
|
t=ReportBuilder::Table.new(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"])
|
104
|
-
|
133
|
+
communalities(m).each_with_index {|com,i|
|
105
134
|
t.add_row([i, 1.0, sprintf("%0.3f", com)])
|
106
135
|
}
|
107
136
|
generator.parse_element(t)
|
@@ -122,6 +151,7 @@ module Factor
|
|
122
151
|
generator.parse_element(t)
|
123
152
|
generator.add_html("</div>")
|
124
153
|
end
|
154
|
+
private :calculate_eigenpairs, :create_centered_ds
|
125
155
|
end
|
126
156
|
end
|
127
157
|
end
|