statsample 1.5.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
@@ -4,11 +4,11 @@ module Statsample
4
4
  # Given a dataset with results and a correct answers hash,
5
5
  # generates a ScaleAnalysis
6
6
  # == Usage
7
- # x1=%{a b b c}.to_vector
8
- # x2=%{b a b c}.to_vector
9
- # x3=%{a c b a}.to_vector
10
- # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3}.to_dataset
11
- # key={'x1'=>'a','x2'=>'b','x3'=>'a'}
7
+ # x1 = Daru::Vector.new(%w{a b b c})
8
+ # x2 = Daru::Vector.new(%w{b a b c})
9
+ # x3 = Daru::Vector.new(%w{a c b a})
10
+ # ds = Daru::DataFrame.new({:x1 => x1, :x2 => x2, :x3 => x3})
11
+ # key={ :x1 => 'a',:x2 => 'b', :x3 => 'a'}
12
12
  # ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds,key)
13
13
  # puts ssa.summary
14
14
  class SkillScaleAnalysis
@@ -30,53 +30,59 @@ module Statsample
30
30
  end
31
31
  # Dataset only corrected vectors
32
32
  def corrected_dataset_minimal
33
- cds=corrected_dataset
34
- dsm=@key.keys.inject({}) {|ac,v| ac[v]=cds[v];ac}.to_dataset
35
- @key.keys.each do |k|
36
- dsm[k].name=_("%s(corrected)") % @ds[k].name
37
- dsm[k].labels=@ds[k].labels
38
- end
33
+ cds = corrected_dataset
34
+ dsm = Daru::DataFrame.new(
35
+ @key.keys.inject({}) do |ac,v|
36
+ ac[v] = cds[v]
37
+ ac
38
+ end
39
+ )
39
40
 
40
- dsm.name=_("Corrected dataset from %s") % @ds.name
41
+ dsm.rename _("Corrected dataset from %s") % @ds.name
41
42
  dsm
42
43
  end
44
+
43
45
  def vector_sum
44
46
  corrected_dataset_minimal.vector_sum
45
47
  end
48
+
46
49
  def vector_mean
47
50
  corrected_dataset_minimal.vector_mean
48
51
  end
52
+
49
53
  def scale_analysis
50
- sa=ScaleAnalysis.new(corrected_dataset_minimal)
54
+ sa = ScaleAnalysis.new(corrected_dataset_minimal)
51
55
  sa.name=_("%s (Scale Analysis)") % @name
52
56
  sa
53
57
  end
58
+
54
59
  def corrected_dataset
55
60
  if @cds.nil?
56
- @cds=@ds.dup_empty
57
- @key.keys.each {|k| @cds[k].type=:numeric; @cds[k].name=@ds[k].name}
58
- @ds.each do |row|
59
- out={}
60
- row.each do |k,v|
61
- if @key.keys.include? k
62
- if @ds[k].is_valid? v
63
- out[k]= @key[k]==v ? 1 : 0
61
+ @cds = Daru::DataFrame.new({}, order: @ds.vectors, name: @ds.name)
62
+ @ds.each_row do |row|
63
+ out = {}
64
+ row.each_with_index do |v, k|
65
+ if @key.has_key? k
66
+ if @ds[k].exists? v
67
+ out[k]= @key[k] == v ? 1 : 0
64
68
  else
65
- out[k]=nil
69
+ out[k] = nil
66
70
  end
67
71
  else
68
- out[k]=v
72
+ out[k] = v
69
73
  end
70
74
  end
71
- @cds.add_case(out,false)
75
+
76
+ @cds.add_row(Daru::Vector.new(out))
72
77
  end
73
- @cds.update_valid_data
78
+ @cds.update
74
79
  end
75
80
  @cds
76
81
  end
82
+
77
83
  def report_building(builder)
78
84
  builder.section(:name=>@name) do |s|
79
- sa=scale_analysis
85
+ sa = scale_analysis
80
86
  s.parse_element(sa)
81
87
  if summary_show_problematic_items
82
88
  s.section(:name=>_("Problematic Items")) do |spi|
@@ -91,17 +97,16 @@ module Statsample
91
97
 
92
98
  spi.table(:name=>"Proportions",:header=>[_("Value"), _("%")]) do |table|
93
99
  props.each do |k1,v|
94
- table.row [ @ds[k].labeling(k1), "%0.3f" % v]
100
+ table.row [ @ds[k].index_of(k1), "%0.3f" % v]
95
101
  end
96
102
  end
97
-
98
103
  end
99
104
  end
100
105
  end
106
+
101
107
  spi.text _("No problematic items") if count==0
102
108
  end
103
109
  end
104
-
105
110
  end
106
111
  end
107
112
  end
@@ -7,7 +7,7 @@ module Statsample
7
7
 
8
8
  def generate (size,low,upper)
9
9
  range=upper-low+1
10
- Vector.new((0...size).collect {|x| rand(range)+low },:numeric)
10
+ Daru::Vector.new((0...size).collect {|x| rand(range)+low })
11
11
  end
12
12
 
13
13
  end
@@ -11,30 +11,20 @@ module Statsample
11
11
  ###
12
12
  # :section: R like methods
13
13
  ###
14
- def read_with_cache(klass, filename,opts=Hash.new, cache=true)
15
- file_ds=filename+".ds"
16
- if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
17
- ds=Statsample.load(file_ds)
18
- else
19
- ds=klass.read(filename)
20
- ds.save(file_ds) if cache
21
- end
22
- ds
23
- end
24
- # Import an Excel file. Cache result by default
25
- def read_excel(filename, opts=Hash.new, cache=true)
26
- read_with_cache(Statsample::Excel, filename, opts, cache)
27
14
 
15
+ # Import an Excel file as a Daru::DataFrame.
16
+ def read_excel(filename, opts=Hash.new)
17
+ Daru::DataFrame.from_excel filename, opts
28
18
  end
29
- # Import an CSV file. Cache result by default
30
19
 
31
- def read_csv
32
- read_with_cache(Statsample::CSV, filename, opts, cache)
20
+ # Import a CSV file as a Daru::DataFrame.
21
+ def read_csv(filename, opts=Hash.new)
22
+ Daru::DataFrame.from_csv filename, opts
33
23
  end
34
24
 
35
25
  # Retrieve names (fields) from dataset
36
26
  def names(ds)
37
- ds.fields
27
+ ds.vectors.to_a
38
28
  end
39
29
  # Create a correlation matrix from a dataset
40
30
  def cor(ds)
@@ -44,21 +34,25 @@ module Statsample
44
34
  def cov(ds)
45
35
  Statsample::Bivariate.covariate_matrix(ds)
46
36
  end
47
- # Create a Statsample::Vector
37
+ # Create a Daru::Vector
48
38
  # Analog to R's c
49
39
  def vector(*args)
50
- Statsample::Vector[*args]
40
+ Daru::Vector[*args]
51
41
  end
52
42
  # Random generation for the normal distribution
53
43
  def rnorm(n,mean=0,sd=1)
54
44
  rng=Distribution::Normal.rng(mean,sd)
55
- Statsample::Vector.new_numeric(n) { rng.call}
45
+ Daru::Vector.new_with_size(n) { rng.call}
56
46
  end
57
- # Creates a new Statsample::Dataset
58
- # Each key is transformed into string
47
+ # Creates a new Daru::DataFrame
48
+ # Each key is transformed into a Symbol wherever possible.
59
49
  def dataset(vectors=Hash.new)
60
- vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac}
61
- Statsample::Dataset.new(vectors)
50
+ vectors = vectors.inject({}) do |ac,v|
51
+ n = v[0].respond_to?(:to_sym) ? v[0].to_sym : v[0]
52
+ ac[n] = v[1]
53
+ ac
54
+ end
55
+ Daru::DataFrame.new(vectors)
62
56
  end
63
57
  alias :data_frame :dataset
64
58
  # Returns a Statsample::Graph::Boxplot
@@ -78,13 +72,15 @@ module Statsample
78
72
  def levene(*args)
79
73
  Statsample::Test::Levene.new(*args)
80
74
  end
75
+
81
76
  def principal_axis(*args)
82
77
  Statsample::Factor::PrincipalAxis.new(*args)
83
-
84
78
  end
79
+
85
80
  def polychoric(*args)
86
81
  Statsample::Bivariate::Polychoric.new(*args)
87
82
  end
83
+
88
84
  def tetrachoric(*args)
89
85
  Statsample::Bivariate::Tetrachoric.new(*args)
90
86
  end
@@ -95,27 +91,35 @@ module Statsample
95
91
  def lr(*args)
96
92
  Statsample::Regression.multiple(*args)
97
93
  end
94
+
98
95
  def pca(ds,opts=Hash.new)
99
96
  Statsample::Factor::PCA.new(ds,opts)
100
97
  end
98
+
101
99
  def dominance_analysis(*args)
102
100
  Statsample::DominanceAnalysis.new(*args)
103
101
  end
102
+
104
103
  def dominance_analysis_bootstrap(*args)
105
104
  Statsample::DominanceAnalysis::Bootstrap.new(*args)
106
105
  end
106
+
107
107
  def scale_analysis(*args)
108
108
  Statsample::Reliability::ScaleAnalysis.new(*args)
109
109
  end
110
+
110
111
  def skill_scale_analysis(*args)
111
112
  Statsample::Reliability::SkillScaleAnalysis.new(*args)
112
113
  end
114
+
113
115
  def multiscale_analysis(*args,&block)
114
116
  Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
115
117
  end
118
+
116
119
  def test_u(*args)
117
120
  Statsample::Test::UMannWhitney.new(*args)
118
121
  end
122
+
119
123
  module_function :test_u, :rnorm
120
124
  end
121
125
  end
@@ -22,6 +22,7 @@ module Statsample
22
22
  end
23
23
  calculate
24
24
  end
25
+
25
26
  def calculate
26
27
  d=0
27
28
  @d1.each {|x|
@@ -31,12 +32,13 @@ module Statsample
31
32
  }
32
33
  @d=d
33
34
  end
35
+
34
36
  # Make a wrapper EmpiricDistribution to any method which implements
35
- # each
36
- # On Statsample::Vector, only uses #valid_data
37
+ # each. On a Daru::Vector, only uses non-missing data.
37
38
  def make_cdf(v)
38
- v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v)
39
+ v.is_a?(Daru::Vector) ? EmpiricDistribution.new(v.only_valid.to_a) : EmpiricDistribution.new(v)
39
40
  end
41
+
40
42
  class EmpiricDistribution
41
43
  def initialize(data)
42
44
  @min=data.min
@@ -5,8 +5,8 @@ module Statsample
5
5
  # <blockquote>Levene's test ( Levene, 1960) is used to test if k samples have equal variances. Equal variances across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples. The Levene test can be used to verify that assumption.</blockquote>
6
6
  # Use:
7
7
  # require 'statsample'
8
- # a=[1,2,3,4,5,6,7,8,100,10].to_numeric
9
- # b=[30,40,50,60,70,80,90,100,110,120].to_numeric
8
+ # a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10])
9
+ # b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120])
10
10
  #
11
11
  # levene=Statsample::Test::Levene.new([a,b])
12
12
  # puts levene.summary
@@ -29,10 +29,10 @@ module Statsample
29
29
  attr_accessor :name
30
30
  # Input could be an array of vectors or a dataset
31
31
  def initialize(input, opts=Hash.new())
32
- if input.is_a? Statsample::Dataset
33
- @vectors=input.vectors.values
32
+ if input.is_a? Daru::DataFrame
33
+ @vectors = input.to_hash.values
34
34
  else
35
- @vectors=input
35
+ @vectors = input
36
36
  end
37
37
  @name=_("Levene Test")
38
38
  opts.each{|k,v|
@@ -48,32 +48,34 @@ module Statsample
48
48
  builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
49
49
  end
50
50
  def compute
51
- n=@vectors.inject(0) {|ac,v| ac+v.n_valid}
51
+ n=@vectors.inject(0) { |ac,v| ac + v.n_valid}
52
52
 
53
- zi=@vectors.collect {|vector|
53
+ zi=@vectors.collect do |vector|
54
54
  mean=vector.mean
55
- vector.collect {|v| (v-mean).abs }.to_numeric
56
- }
55
+ Daru::Vector.new(vector.collect { |v| (v - mean).abs })
56
+ end
57
57
 
58
- total_mean=zi.inject([]) {|ac,vector|
59
- ac+vector.valid_data
60
- }.to_numeric.mean
58
+ total_mean = Daru::Vector.new(
59
+ zi.inject([]) do |ac,vector|
60
+ ac + vector.only_valid(:array)
61
+ end
62
+ ).mean
61
63
 
62
- k=@vectors.size
63
-
64
- sum_num=zi.inject(0) {|ac,vector|
65
- ac+(vector.size*(vector.mean-total_mean)**2)
66
- }
64
+ k = @vectors.size
65
+ sum_num = zi.inject(0) do |ac,vector|
66
+ ac + (vector.size * (vector.mean - total_mean)**2)
67
+ end
67
68
 
68
- sum_den=zi.inject(0) {|ac,vector|
69
- z_mean=vector.mean
70
- ac+vector.valid_data.inject(0) {|acp,zij|
71
- acp+(zij-z_mean)**2
72
- }
73
- }
74
- @w=((n-k)*sum_num).quo((k-1)*sum_den)
75
- @d1=k-1
76
- @d2=n-k
69
+ sum_den = zi.inject(0) do |ac,vector|
70
+ z_mean = vector.mean
71
+ ac + vector.only_valid(:array).inject(0) do |acp,zij|
72
+ acp + (zij - z_mean)**2
73
+ end
74
+ end
75
+
76
+ @w = ((n - k) * sum_num).quo((k - 1) * sum_den)
77
+ @d1 = k - 1
78
+ @d2 = n - k
77
79
  end
78
80
  private :compute
79
81
  # Probability.
@@ -81,7 +83,6 @@ module Statsample
81
83
  def probability
82
84
  p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
83
85
  end
84
-
85
86
  end
86
87
  end
87
88
  end
@@ -1,10 +1,8 @@
1
1
  module Statsample
2
2
  module Test
3
-
4
-
5
-
6
-
7
- # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
3
+ # A t-test is any statistical hypothesis test in which the test
4
+ # statistic follows a Student's t distribution, if the null
5
+ # hypothesis is supported
8
6
  class T
9
7
 
10
8
  class << self
@@ -125,7 +123,7 @@ module Statsample
125
123
 
126
124
  # One Sample t-test
127
125
  # == Usage
128
- # a=1000.times.map {rand(100)}.to_numeric
126
+ # a = Daru::Vector.new(1000.times.map {rand(100)})
129
127
  # t_1=Statsample::Test::T::OneSample.new(a, {:u=>50})
130
128
  # t_1.summary
131
129
  #
@@ -196,8 +194,8 @@ module Statsample
196
194
  # Two Sample t-test.
197
195
  #
198
196
  # == Usage
199
- # a=1000.times.map {rand(100)}.to_numeric
200
- # b=1000.times.map {rand(100)}.to_numeric
197
+ # a = Daru::Vector.new(1000.times.map {rand(100)})
198
+ # b = Daru::Vector.new(1000.times.map {rand(100)})
201
199
  # t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b)
202
200
  # t_2.summary
203
201
  # === Output
@@ -290,7 +288,7 @@ module Statsample
290
288
  def report_building(b) # :nodoc:
291
289
  b.section(:name=>@name) {|g|
292
290
  g.table(:name=>_("Mean and standard deviation"), :header=>[_("Variable"), _("mean"), _("sd"),_("n")]) {|t|
293
- t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd,@v1.n_valid])
291
+ t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd, @v1.n_valid])
294
292
  t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid])
295
293
  }
296
294
  g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances")))
@@ -113,36 +113,36 @@ module Statsample
113
113
  include Summarizable
114
114
  #
115
115
  # Create a new U Mann-Whitney test
116
- # Params: Two Statsample::Vectors
116
+ # Params: Two Daru::Vectors
117
117
  #
118
118
  def initialize(v1,v2, opts=Hash.new)
119
- @v1=v1
120
- @v2=v2
121
- @n1=v1.valid_data.size
122
- @n2=v2.valid_data.size
123
- data=(v1.valid_data+v2.valid_data).to_numeric
124
- groups=(([0]*@n1)+([1]*@n2)).to_vector
125
- ds={'g'=>groups, 'data'=>data}.to_dataset
126
- @t=nil
127
- @ties=data.data.size!=data.data.uniq.size
128
- if(@ties)
129
- adjust_for_ties(ds['data'])
119
+ @v1 = v1
120
+ @v2 = v2
121
+ v1_valid = v1.only_valid.reset_index!
122
+ v2_valid = v2.only_valid.reset_index!
123
+ @n1 = v1_valid.size
124
+ @n2 = v2_valid.size
125
+ data = Daru::Vector.new(v1_valid.to_a + v2_valid.to_a)
126
+ groups = Daru::Vector.new(([0] * @n1) + ([1] * @n2))
127
+ ds = Daru::DataFrame.new({:g => groups, :data => data})
128
+ @t = nil
129
+ @ties = data.to_a.size != data.to_a.uniq.size
130
+ if @ties
131
+ adjust_for_ties(ds[:data])
130
132
  end
131
- ds['ranked']=ds['data'].ranked(:numeric)
132
-
133
- @n=ds.cases
133
+ ds[:ranked] = ds[:data].ranked
134
+ @n = ds.nrows
134
135
 
135
- @r1=ds.filter{|r| r['g']==0}['ranked'].sum
136
- @r2=((ds.cases*(ds.cases+1)).quo(2))-r1
137
- @u1=r1-((@n1*(@n1+1)).quo(2))
138
- @u2=r2-((@n2*(@n2+1)).quo(2))
139
- @u=(u1<u2) ? u1 : u2
140
- opts_default={:name=>_("Mann-Whitney's U")}
141
- @opts=opts_default.merge(opts)
136
+ @r1 = ds.filter_rows { |r| r[:g] == 0}[:ranked].sum
137
+ @r2 = ((ds.nrows * (ds.nrows + 1)).quo(2)) - r1
138
+ @u1 = r1 - ((@n1 * (@n1 + 1)).quo(2))
139
+ @u2 = r2 - ((@n2 * (@n2 + 1)).quo(2))
140
+ @u = (u1 < u2) ? u1 : u2
141
+ opts_default = { :name=>_("Mann-Whitney's U") }
142
+ @opts = opts_default.merge(opts)
142
143
  opts_default.keys.each {|k|
143
144
  send("#{k}=", @opts[k])
144
- }
145
-
145
+ }
146
146
  end
147
147
  def report_building(generator) # :nodoc:
148
148
  generator.section(:name=>@name) do |s|
@@ -160,8 +160,8 @@ module Statsample
160
160
  # Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000.
161
161
  # Uses u_sampling_distribution_as62
162
162
  def probability_exact
163
- dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
164
- sum=0
163
+ dist = UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
164
+ sum = 0
165
165
  (0..@u.to_i).each {|i|
166
166
  sum+=dist[i]
167
167
  }
@@ -172,8 +172,8 @@ module Statsample
172
172
  # == Reference:
173
173
  # * http://europe.isixsigma.com/library/content/c080806a.asp
174
174
  def adjust_for_ties(data)
175
- @t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v|
176
- a+(v[1]**3-v[1]).quo(12)
175
+ @t = data.frequencies.find_all { |k,v| v > 1 }.inject(0) { |a,v|
176
+ a + (v[1]**3 - v[1]).quo(12)
177
177
  }
178
178
  end
179
179