statsample 1.5.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
@@ -4,11 +4,11 @@ module Statsample
|
|
4
4
|
# Given a dataset with results and a correct answers hash,
|
5
5
|
# generates a ScaleAnalysis
|
6
6
|
# == Usage
|
7
|
-
# x1
|
8
|
-
# x2
|
9
|
-
# x3
|
10
|
-
# ds={
|
11
|
-
# key={
|
7
|
+
# x1 = Daru::Vector.new(%{a b b c})
|
8
|
+
# x2 = Daru::Vector.new(%{b a b c})
|
9
|
+
# x3 = Daru::Vector.new(%{a c b a})
|
10
|
+
# ds = Daru::DataFrame.new({:x1 => @x1, :x2 => @x2, :x3 => @x3})
|
11
|
+
# key={ :x1 => 'a',:x2 => 'b', :x3 => 'a'}
|
12
12
|
# ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds,key)
|
13
13
|
# puts ssa.summary
|
14
14
|
class SkillScaleAnalysis
|
@@ -30,53 +30,59 @@ module Statsample
|
|
30
30
|
end
|
31
31
|
# Dataset only corrected vectors
|
32
32
|
def corrected_dataset_minimal
|
33
|
-
cds=corrected_dataset
|
34
|
-
dsm
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
33
|
+
cds = corrected_dataset
|
34
|
+
dsm = Daru::DataFrame.new(
|
35
|
+
@key.keys.inject({}) do |ac,v|
|
36
|
+
ac[v] = cds[v]
|
37
|
+
ac
|
38
|
+
end
|
39
|
+
)
|
39
40
|
|
40
|
-
dsm.
|
41
|
+
dsm.rename _("Corrected dataset from %s") % @ds.name
|
41
42
|
dsm
|
42
43
|
end
|
44
|
+
|
43
45
|
def vector_sum
|
44
46
|
corrected_dataset_minimal.vector_sum
|
45
47
|
end
|
48
|
+
|
46
49
|
def vector_mean
|
47
50
|
corrected_dataset_minimal.vector_mean
|
48
51
|
end
|
52
|
+
|
49
53
|
def scale_analysis
|
50
|
-
sa=ScaleAnalysis.new(corrected_dataset_minimal)
|
54
|
+
sa = ScaleAnalysis.new(corrected_dataset_minimal)
|
51
55
|
sa.name=_("%s (Scale Analysis)") % @name
|
52
56
|
sa
|
53
57
|
end
|
58
|
+
|
54
59
|
def corrected_dataset
|
55
60
|
if @cds.nil?
|
56
|
-
@cds
|
57
|
-
@
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
out[k]= @key[k]==v ? 1 : 0
|
61
|
+
@cds = Daru::DataFrame.new({}, order: @ds.vectors, name: @ds.name)
|
62
|
+
@ds.each_row do |row|
|
63
|
+
out = {}
|
64
|
+
row.each_with_index do |v, k|
|
65
|
+
if @key.has_key? k
|
66
|
+
if @ds[k].exists? v
|
67
|
+
out[k]= @key[k] == v ? 1 : 0
|
64
68
|
else
|
65
|
-
out[k]=nil
|
69
|
+
out[k] = nil
|
66
70
|
end
|
67
71
|
else
|
68
|
-
out[k]=v
|
72
|
+
out[k] = v
|
69
73
|
end
|
70
74
|
end
|
71
|
-
|
75
|
+
|
76
|
+
@cds.add_row(Daru::Vector.new(out))
|
72
77
|
end
|
73
|
-
@cds.
|
78
|
+
@cds.update
|
74
79
|
end
|
75
80
|
@cds
|
76
81
|
end
|
82
|
+
|
77
83
|
def report_building(builder)
|
78
84
|
builder.section(:name=>@name) do |s|
|
79
|
-
sa=scale_analysis
|
85
|
+
sa = scale_analysis
|
80
86
|
s.parse_element(sa)
|
81
87
|
if summary_show_problematic_items
|
82
88
|
s.section(:name=>_("Problematic Items")) do |spi|
|
@@ -91,17 +97,16 @@ module Statsample
|
|
91
97
|
|
92
98
|
spi.table(:name=>"Proportions",:header=>[_("Value"), _("%")]) do |table|
|
93
99
|
props.each do |k1,v|
|
94
|
-
table.row [ @ds[k].
|
100
|
+
table.row [ @ds[k].index_of(k1), "%0.3f" % v]
|
95
101
|
end
|
96
102
|
end
|
97
|
-
|
98
103
|
end
|
99
104
|
end
|
100
105
|
end
|
106
|
+
|
101
107
|
spi.text _("No problematic items") if count==0
|
102
108
|
end
|
103
109
|
end
|
104
|
-
|
105
110
|
end
|
106
111
|
end
|
107
112
|
end
|
data/lib/statsample/resample.rb
CHANGED
data/lib/statsample/shorthand.rb
CHANGED
@@ -11,30 +11,20 @@ module Statsample
|
|
11
11
|
###
|
12
12
|
# :section: R like methods
|
13
13
|
###
|
14
|
-
def read_with_cache(klass, filename,opts=Hash.new, cache=true)
|
15
|
-
file_ds=filename+".ds"
|
16
|
-
if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
|
17
|
-
ds=Statsample.load(file_ds)
|
18
|
-
else
|
19
|
-
ds=klass.read(filename)
|
20
|
-
ds.save(file_ds) if cache
|
21
|
-
end
|
22
|
-
ds
|
23
|
-
end
|
24
|
-
# Import an Excel file. Cache result by default
|
25
|
-
def read_excel(filename, opts=Hash.new, cache=true)
|
26
|
-
read_with_cache(Statsample::Excel, filename, opts, cache)
|
27
14
|
|
15
|
+
# Import an Excel file. Cache result by default
|
16
|
+
def read_excel(filename, opts=Hash.new)
|
17
|
+
Daru::DataFrame.from_excel filename, opts
|
28
18
|
end
|
29
|
-
# Import an CSV file. Cache result by default
|
30
19
|
|
31
|
-
|
32
|
-
|
20
|
+
# Import an CSV file. Cache result by default
|
21
|
+
def read_csv(filename, opts=Hash.new)
|
22
|
+
Daru::DataFrame.from_csv filename, opts
|
33
23
|
end
|
34
24
|
|
35
25
|
# Retrieve names (fields) from dataset
|
36
26
|
def names(ds)
|
37
|
-
ds.
|
27
|
+
ds.vectors.to_a
|
38
28
|
end
|
39
29
|
# Create a correlation matrix from a dataset
|
40
30
|
def cor(ds)
|
@@ -44,21 +34,25 @@ module Statsample
|
|
44
34
|
def cov(ds)
|
45
35
|
Statsample::Bivariate.covariate_matrix(ds)
|
46
36
|
end
|
47
|
-
# Create a
|
37
|
+
# Create a Daru::Vector
|
48
38
|
# Analog to R's c
|
49
39
|
def vector(*args)
|
50
|
-
|
40
|
+
Daru::Vector[*args]
|
51
41
|
end
|
52
42
|
# Random generation for the normal distribution
|
53
43
|
def rnorm(n,mean=0,sd=1)
|
54
44
|
rng=Distribution::Normal.rng(mean,sd)
|
55
|
-
|
45
|
+
Daru::Vector.new_with_size(n) { rng.call}
|
56
46
|
end
|
57
|
-
# Creates a new
|
58
|
-
# Each key is transformed into
|
47
|
+
# Creates a new Daru::DataFrame
|
48
|
+
# Each key is transformed into a Symbol wherever possible.
|
59
49
|
def dataset(vectors=Hash.new)
|
60
|
-
vectors=vectors.inject({})
|
61
|
-
|
50
|
+
vectors = vectors.inject({}) do |ac,v|
|
51
|
+
n = v[0].respond_to?(:to_sym) ? v[0].to_sym : v[0]
|
52
|
+
ac[n] = v[1]
|
53
|
+
ac
|
54
|
+
end
|
55
|
+
Daru::DataFrame.new(vectors)
|
62
56
|
end
|
63
57
|
alias :data_frame :dataset
|
64
58
|
# Returns a Statsample::Graph::Boxplot
|
@@ -78,13 +72,15 @@ module Statsample
|
|
78
72
|
def levene(*args)
|
79
73
|
Statsample::Test::Levene.new(*args)
|
80
74
|
end
|
75
|
+
|
81
76
|
def principal_axis(*args)
|
82
77
|
Statsample::Factor::PrincipalAxis.new(*args)
|
83
|
-
|
84
78
|
end
|
79
|
+
|
85
80
|
def polychoric(*args)
|
86
81
|
Statsample::Bivariate::Polychoric.new(*args)
|
87
82
|
end
|
83
|
+
|
88
84
|
def tetrachoric(*args)
|
89
85
|
Statsample::Bivariate::Tetrachoric.new(*args)
|
90
86
|
end
|
@@ -95,27 +91,35 @@ module Statsample
|
|
95
91
|
def lr(*args)
|
96
92
|
Statsample::Regression.multiple(*args)
|
97
93
|
end
|
94
|
+
|
98
95
|
def pca(ds,opts=Hash.new)
|
99
96
|
Statsample::Factor::PCA.new(ds,opts)
|
100
97
|
end
|
98
|
+
|
101
99
|
def dominance_analysis(*args)
|
102
100
|
Statsample::DominanceAnalysis.new(*args)
|
103
101
|
end
|
102
|
+
|
104
103
|
def dominance_analysis_bootstrap(*args)
|
105
104
|
Statsample::DominanceAnalysis::Bootstrap.new(*args)
|
106
105
|
end
|
106
|
+
|
107
107
|
def scale_analysis(*args)
|
108
108
|
Statsample::Reliability::ScaleAnalysis.new(*args)
|
109
109
|
end
|
110
|
+
|
110
111
|
def skill_scale_analysis(*args)
|
111
112
|
Statsample::Reliability::SkillScaleAnalysis.new(*args)
|
112
113
|
end
|
114
|
+
|
113
115
|
def multiscale_analysis(*args,&block)
|
114
116
|
Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
|
115
117
|
end
|
118
|
+
|
116
119
|
def test_u(*args)
|
117
120
|
Statsample::Test::UMannWhitney.new(*args)
|
118
121
|
end
|
122
|
+
|
119
123
|
module_function :test_u, :rnorm
|
120
124
|
end
|
121
125
|
end
|
@@ -22,6 +22,7 @@ module Statsample
|
|
22
22
|
end
|
23
23
|
calculate
|
24
24
|
end
|
25
|
+
|
25
26
|
def calculate
|
26
27
|
d=0
|
27
28
|
@d1.each {|x|
|
@@ -31,12 +32,13 @@ module Statsample
|
|
31
32
|
}
|
32
33
|
@d=d
|
33
34
|
end
|
35
|
+
|
34
36
|
# Make a wrapper EmpiricDistribution to any method which implements
|
35
|
-
# each
|
36
|
-
# On Statsample::Vector, only uses #valid_data
|
37
|
+
# each on Statsample::Vector, only uses non-missing data.
|
37
38
|
def make_cdf(v)
|
38
|
-
v.is_a?(
|
39
|
+
v.is_a?(Daru::Vector) ? EmpiricDistribution.new(v.only_valid.to_a) : EmpiricDistribution.new(v)
|
39
40
|
end
|
41
|
+
|
40
42
|
class EmpiricDistribution
|
41
43
|
def initialize(data)
|
42
44
|
@min=data.min
|
@@ -5,8 +5,8 @@ module Statsample
|
|
5
5
|
# <blockquote>Levene's test ( Levene, 1960) is used to test if k samples have equal variances. Equal variances across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples. The Levene test can be used to verify that assumption.</blockquote>
|
6
6
|
# Use:
|
7
7
|
# require 'statsample'
|
8
|
-
# a=[1,2,3,4,5,6,7,8,100,10]
|
9
|
-
# b=[30,40,50,60,70,80,90,100,110,120]
|
8
|
+
# a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10])
|
9
|
+
# b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120])
|
10
10
|
#
|
11
11
|
# levene=Statsample::Test::Levene.new([a,b])
|
12
12
|
# puts levene.summary
|
@@ -29,10 +29,10 @@ module Statsample
|
|
29
29
|
attr_accessor :name
|
30
30
|
# Input could be an array of vectors or a dataset
|
31
31
|
def initialize(input, opts=Hash.new())
|
32
|
-
if input.is_a?
|
33
|
-
@vectors=input.
|
32
|
+
if input.is_a? Daru::DataFrame
|
33
|
+
@vectors = input.to_hash.values
|
34
34
|
else
|
35
|
-
@vectors=input
|
35
|
+
@vectors = input
|
36
36
|
end
|
37
37
|
@name=_("Levene Test")
|
38
38
|
opts.each{|k,v|
|
@@ -48,32 +48,34 @@ module Statsample
|
|
48
48
|
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
|
49
49
|
end
|
50
50
|
def compute
|
51
|
-
n=@vectors.inject(0) {|ac,v| ac+v.n_valid}
|
51
|
+
n=@vectors.inject(0) { |ac,v| ac + v.n_valid}
|
52
52
|
|
53
|
-
zi=@vectors.collect
|
53
|
+
zi=@vectors.collect do |vector|
|
54
54
|
mean=vector.mean
|
55
|
-
vector.collect {|v| (v-mean).abs }
|
56
|
-
|
55
|
+
Daru::Vector.new(vector.collect { |v| (v - mean).abs })
|
56
|
+
end
|
57
57
|
|
58
|
-
total_mean=
|
59
|
-
ac
|
60
|
-
|
58
|
+
total_mean = Daru::Vector.new(
|
59
|
+
zi.inject([]) do |ac,vector|
|
60
|
+
ac + vector.only_valid(:array)
|
61
|
+
end
|
62
|
+
).mean
|
61
63
|
|
62
|
-
k
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
}
|
64
|
+
k = @vectors.size
|
65
|
+
sum_num = zi.inject(0) do |ac,vector|
|
66
|
+
ac + (vector.size * (vector.mean - total_mean)**2)
|
67
|
+
end
|
67
68
|
|
68
|
-
sum_den=zi.inject(0)
|
69
|
-
z_mean=vector.mean
|
70
|
-
ac+vector.
|
71
|
-
acp+(zij-z_mean)**2
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
@
|
76
|
-
@
|
69
|
+
sum_den = zi.inject(0) do |ac,vector|
|
70
|
+
z_mean = vector.mean
|
71
|
+
ac + vector.only_valid(:array).inject(0) do |acp,zij|
|
72
|
+
acp + (zij - z_mean)**2
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
@w = ((n - k) * sum_num).quo((k - 1) * sum_den)
|
77
|
+
@d1 = k - 1
|
78
|
+
@d2 = n - k
|
77
79
|
end
|
78
80
|
private :compute
|
79
81
|
# Probability.
|
@@ -81,7 +83,6 @@ module Statsample
|
|
81
83
|
def probability
|
82
84
|
p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
|
83
85
|
end
|
84
|
-
|
85
86
|
end
|
86
87
|
end
|
87
88
|
end
|
data/lib/statsample/test/t.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
module Statsample
|
2
2
|
module Test
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
|
3
|
+
# A t-test is any statistical hypothesis test in which the test
|
4
|
+
# statistic follows a Student's t distribution, if the null
|
5
|
+
# hypothesis is supported
|
8
6
|
class T
|
9
7
|
|
10
8
|
class << self
|
@@ -125,7 +123,7 @@ module Statsample
|
|
125
123
|
|
126
124
|
# One Sample t-test
|
127
125
|
# == Usage
|
128
|
-
# a=1000.times.map {rand(100)}
|
126
|
+
# a = Daru::Vector.new(1000.times.map {rand(100)})
|
129
127
|
# t_1=Statsample::Test::T::OneSample.new(a, {:u=>50})
|
130
128
|
# t_1.summary
|
131
129
|
#
|
@@ -196,8 +194,8 @@ module Statsample
|
|
196
194
|
# Two Sample t-test.
|
197
195
|
#
|
198
196
|
# == Usage
|
199
|
-
# a=1000.times.map {rand(100)}
|
200
|
-
# b=1000.times.map {rand(100)}
|
197
|
+
# a = Daru::Vector.new(1000.times.map {rand(100)})
|
198
|
+
# b = Daru::Vector.new(1000.times.map {rand(100)})
|
201
199
|
# t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b)
|
202
200
|
# t_2.summary
|
203
201
|
# === Output
|
@@ -290,7 +288,7 @@ module Statsample
|
|
290
288
|
def report_building(b) # :nodoc:
|
291
289
|
b.section(:name=>@name) {|g|
|
292
290
|
g.table(:name=>_("Mean and standard deviation"), :header=>[_("Variable"), _("mean"), _("sd"),_("n")]) {|t|
|
293
|
-
t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd
|
291
|
+
t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd, @v1.n_valid])
|
294
292
|
t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid])
|
295
293
|
}
|
296
294
|
g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances")))
|
@@ -113,36 +113,36 @@ module Statsample
|
|
113
113
|
include Summarizable
|
114
114
|
#
|
115
115
|
# Create a new U Mann-Whitney test
|
116
|
-
# Params: Two
|
116
|
+
# Params: Two Daru::Vectors
|
117
117
|
#
|
118
118
|
def initialize(v1,v2, opts=Hash.new)
|
119
|
-
@v1=v1
|
120
|
-
@v2=v2
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
@
|
127
|
-
|
128
|
-
|
129
|
-
|
119
|
+
@v1 = v1
|
120
|
+
@v2 = v2
|
121
|
+
v1_valid = v1.only_valid.reset_index!
|
122
|
+
v2_valid = v2.only_valid.reset_index!
|
123
|
+
@n1 = v1_valid.size
|
124
|
+
@n2 = v2_valid.size
|
125
|
+
data = Daru::Vector.new(v1_valid.to_a + v2_valid.to_a)
|
126
|
+
groups = Daru::Vector.new(([0] * @n1) + ([1] * @n2))
|
127
|
+
ds = Daru::DataFrame.new({:g => groups, :data => data})
|
128
|
+
@t = nil
|
129
|
+
@ties = data.to_a.size != data.to_a.uniq.size
|
130
|
+
if @ties
|
131
|
+
adjust_for_ties(ds[:data])
|
130
132
|
end
|
131
|
-
ds[
|
132
|
-
|
133
|
-
@n=ds.cases
|
133
|
+
ds[:ranked] = ds[:data].ranked
|
134
|
+
@n = ds.nrows
|
134
135
|
|
135
|
-
@r1=ds.
|
136
|
-
@r2=((ds.
|
137
|
-
@u1=r1-((@n1*(@n1+1)).quo(2))
|
138
|
-
@u2=r2-((@n2*(@n2+1)).quo(2))
|
139
|
-
@u=(u1<u2) ? u1 : u2
|
140
|
-
opts_default={:name=>_("Mann-Whitney's U")}
|
141
|
-
@opts=opts_default.merge(opts)
|
136
|
+
@r1 = ds.filter_rows { |r| r[:g] == 0}[:ranked].sum
|
137
|
+
@r2 = ((ds.nrows * (ds.nrows + 1)).quo(2)) - r1
|
138
|
+
@u1 = r1 - ((@n1 * (@n1 + 1)).quo(2))
|
139
|
+
@u2 = r2 - ((@n2 * (@n2 + 1)).quo(2))
|
140
|
+
@u = (u1 < u2) ? u1 : u2
|
141
|
+
opts_default = { :name=>_("Mann-Whitney's U") }
|
142
|
+
@opts = opts_default.merge(opts)
|
142
143
|
opts_default.keys.each {|k|
|
143
144
|
send("#{k}=", @opts[k])
|
144
|
-
}
|
145
|
-
|
145
|
+
}
|
146
146
|
end
|
147
147
|
def report_building(generator) # :nodoc:
|
148
148
|
generator.section(:name=>@name) do |s|
|
@@ -160,8 +160,8 @@ module Statsample
|
|
160
160
|
# Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000.
|
161
161
|
# Uses u_sampling_distribution_as62
|
162
162
|
def probability_exact
|
163
|
-
dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
|
164
|
-
sum=0
|
163
|
+
dist = UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
|
164
|
+
sum = 0
|
165
165
|
(0..@u.to_i).each {|i|
|
166
166
|
sum+=dist[i]
|
167
167
|
}
|
@@ -172,8 +172,8 @@ module Statsample
|
|
172
172
|
# == Reference:
|
173
173
|
# * http://europe.isixsigma.com/library/content/c080806a.asp
|
174
174
|
def adjust_for_ties(data)
|
175
|
-
@t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v|
|
176
|
-
a+(v[1]**3-v[1]).quo(12)
|
175
|
+
@t = data.frequencies.find_all { |k,v| v > 1 }.inject(0) { |a,v|
|
176
|
+
a + (v[1]**3 - v[1]).quo(12)
|
177
177
|
}
|
178
178
|
end
|
179
179
|
|