statsample 0.18.0 → 1.0.0

Files changed (121)
  1. data.tar.gz.sig +0 -0
  2. data/History.txt +23 -0
  3. data/Manifest.txt +28 -17
  4. data/Rakefile +3 -2
  5. data/benchmarks/correlation_matrix_15_variables.rb +31 -0
  6. data/benchmarks/correlation_matrix_5_variables.rb +32 -0
  7. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  8. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  9. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  11. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  13. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  14. data/benchmarks/factor_map.rb +37 -0
  15. data/benchmarks/helpers_benchmark.rb +5 -0
  16. data/examples/boxplot.rb +13 -14
  17. data/examples/correlation_matrix.rb +16 -8
  18. data/examples/dataset.rb +13 -4
  19. data/examples/dominance_analysis.rb +23 -17
  20. data/examples/dominance_analysis_bootstrap.rb +28 -22
  21. data/examples/histogram.rb +8 -9
  22. data/examples/icc.rb +20 -21
  23. data/examples/levene.rb +10 -4
  24. data/examples/multiple_regression.rb +9 -28
  25. data/examples/multivariate_correlation.rb +9 -3
  26. data/examples/parallel_analysis.rb +20 -16
  27. data/examples/polychoric.rb +15 -9
  28. data/examples/principal_axis.rb +18 -6
  29. data/examples/reliability.rb +26 -13
  30. data/examples/scatterplot.rb +10 -6
  31. data/examples/t_test.rb +15 -6
  32. data/examples/tetrachoric.rb +9 -2
  33. data/examples/u_test.rb +12 -4
  34. data/examples/vector.rb +13 -2
  35. data/examples/velicer_map_test.rb +33 -26
  36. data/lib/statsample.rb +32 -12
  37. data/lib/statsample/analysis.rb +79 -0
  38. data/lib/statsample/analysis/suite.rb +72 -0
  39. data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
  40. data/lib/statsample/bivariate.rb +70 -16
  41. data/lib/statsample/dataset.rb +25 -19
  42. data/lib/statsample/dominanceanalysis.rb +2 -2
  43. data/lib/statsample/factor.rb +2 -0
  44. data/lib/statsample/factor/map.rb +16 -10
  45. data/lib/statsample/factor/parallelanalysis.rb +9 -3
  46. data/lib/statsample/factor/pca.rb +28 -32
  47. data/lib/statsample/factor/rotation.rb +15 -8
  48. data/lib/statsample/graph/boxplot.rb +3 -4
  49. data/lib/statsample/graph/histogram.rb +2 -1
  50. data/lib/statsample/graph/scatterplot.rb +1 -0
  51. data/lib/statsample/matrix.rb +106 -16
  52. data/lib/statsample/regression.rb +4 -1
  53. data/lib/statsample/regression/binomial.rb +1 -1
  54. data/lib/statsample/regression/multiple/baseengine.rb +19 -9
  55. data/lib/statsample/regression/multiple/gslengine.rb +127 -126
  56. data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
  57. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  58. data/lib/statsample/regression/simple.rb +31 -6
  59. data/lib/statsample/reliability.rb +11 -3
  60. data/lib/statsample/reliability/scaleanalysis.rb +4 -4
  61. data/lib/statsample/shorthand.rb +81 -0
  62. data/lib/statsample/test/chisquare.rb +1 -1
  63. data/lib/statsample/vector.rb +163 -163
  64. data/lib/statsample/vector/gsl.rb +106 -0
  65. data/references.txt +2 -2
  66. data/{data → test/fixtures}/crime.txt +0 -0
  67. data/{data → test/fixtures}/hartman_23.matrix +0 -0
  68. data/{data → test/fixtures}/repeated_fields.csv +0 -0
  69. data/{data → test/fixtures}/test_binomial.csv +0 -0
  70. data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
  71. data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
  72. data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
  73. data/{data → test/fixtures}/tetmat_test.txt +0 -0
  74. data/test/helpers_tests.rb +18 -2
  75. data/test/test_analysis.rb +118 -0
  76. data/test/test_anovatwoway.rb +1 -1
  77. data/test/test_anovatwowaywithdataset.rb +1 -1
  78. data/test/test_anovawithvectors.rb +1 -2
  79. data/test/test_bartlettsphericity.rb +1 -2
  80. data/test/test_bivariate.rb +64 -22
  81. data/test/test_codification.rb +1 -2
  82. data/test/test_crosstab.rb +1 -2
  83. data/test/test_csv.rb +3 -4
  84. data/test/test_dataset.rb +24 -3
  85. data/test/test_dominance_analysis.rb +1 -2
  86. data/test/test_factor.rb +8 -69
  87. data/test/test_factor_map.rb +43 -0
  88. data/test/test_factor_pa.rb +54 -0
  89. data/test/test_ggobi.rb +1 -1
  90. data/test/test_gsl.rb +12 -18
  91. data/test/test_histogram.rb +1 -2
  92. data/test/test_logit.rb +62 -18
  93. data/test/test_matrix.rb +4 -5
  94. data/test/test_mle.rb +3 -4
  95. data/test/test_regression.rb +21 -2
  96. data/test/test_reliability.rb +3 -3
  97. data/test/test_reliability_icc.rb +1 -1
  98. data/test/test_reliability_skillscale.rb +20 -4
  99. data/test/test_resample.rb +1 -2
  100. data/test/test_rserve_extension.rb +1 -2
  101. data/test/test_srs.rb +1 -2
  102. data/test/test_statistics.rb +1 -2
  103. data/test/test_stest.rb +1 -2
  104. data/test/test_stratified.rb +1 -2
  105. data/test/test_test_f.rb +1 -2
  106. data/test/test_test_t.rb +1 -2
  107. data/test/test_umannwhitney.rb +1 -2
  108. data/test/test_vector.rb +117 -18
  109. data/test/test_xls.rb +2 -3
  110. data/web/Rakefile +39 -0
  111. metadata +109 -29
  112. metadata.gz.sig +0 -0
  113. data/examples/parallel_analysis_tetrachoric.rb +0 -31
  114. data/lib/distribution.rb +0 -25
  115. data/lib/distribution/chisquare.rb +0 -23
  116. data/lib/distribution/f.rb +0 -35
  117. data/lib/distribution/normal.rb +0 -60
  118. data/lib/distribution/normalbivariate.rb +0 -284
  119. data/lib/distribution/normalmultivariate.rb +0 -73
  120. data/lib/distribution/t.rb +0 -55
  121. data/test/test_distribution.rb +0 -73
data/lib/statsample/regression/multiple/matrixengine.rb

@@ -30,13 +30,13 @@ class MatrixEngine < BaseEngine
 
   # Number of cases
   attr_writer :cases
-
+  attr_writer :digits
   # Create object
   #
   def initialize(matrix,y_var, opts=Hash.new)
     matrix.extend Statsample::CovariateMatrix
     raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var
-    if matrix.type==:covariance
+    if matrix._type==:covariance
       @matrix_cov=matrix
       @matrix_cor=matrix.correlation
       @no_covariance=false
@@ -53,6 +53,8 @@ class MatrixEngine < BaseEngine
     @predictors_n=@n_predictors
     @matrix_x= @matrix_cor.submatrix(@fields)
     @matrix_x_cov= @matrix_cov.submatrix(@fields)
+    raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15
+
 
     @matrix_y = @matrix_cor.submatrix(@fields, [y_var])
     @matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])
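
The new guard refuses to invert a singular predictor matrix. A minimal sketch of the idea (illustration only, using Ruby's stdlib Matrix rather than statsample): two perfectly collinear regressors make the correlation submatrix singular, so its determinant falls below the 1e-15 threshold and LinearDependency would be raised.

    require 'matrix'
    # Two perfectly correlated predictors: the 2x2 correlation submatrix is singular.
    r_xx = Matrix[[1.0, 1.0],
                  [1.0, 1.0]]
    puts r_xx.determinant  # => 0.0, below 1e-15, so the engine would refuse to continue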
@@ -75,13 +77,14 @@ class MatrixEngine < BaseEngine
     @y_mean=0.0
     @name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
 
-
+    opts_default={:digits=>3}
+    opts=opts_default.merge opts
     opts.each{|k,v|
       self.send("#{k}=",v) if self.respond_to? k
     }
     result_matrix=@matrix_x_cov.inverse * @matrix_y_cov
 
-    if matrix.type==:covariance
+    if matrix._type==:covariance
       @coeffs=result_matrix.column(0).to_a
       @coeffs_stan=coeffs.collect {|k,v|
         coeffs[k]*@x_sd[k].quo(@y_sd)
@@ -141,7 +144,7 @@ class MatrixEngine < BaseEngine
   # Tolerance for a given variable
   # defined as (1-R^2) of regression of other independent variables
   # over the selected
-  # Reference:
+  # == Reference:
   # * http://talkstats.com/showthread.php?t=5056
   def tolerance(var)
     return 1 if @matrix_x.column_size==1
data/lib/statsample/regression/multiple/rubyengine.rb

@@ -17,7 +17,7 @@ module Multiple
 
   class RubyEngine < MatrixEngine
     def initialize(ds,y_var, opts=Hash.new)
-      matrix=Statsample::Bivariate.correlation_matrix(ds)
+      matrix=ds.correlation_matrix
       fields_indep=ds.fields-[y_var]
       default={
         :y_mean=>ds[y_var].mean,
data/lib/statsample/regression/simple.rb

@@ -8,8 +8,10 @@ module Statsample
   # * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
   #
   class Simple
+    include Summarizable
     attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
-
+    attr_accessor :name
+    attr_accessor :digits
     def initialize(init_method, *argv)
       self.send(init_method, *argv)
     end
@@ -61,15 +63,15 @@ module Statsample
         new(:init_gsl, *ar)
       end
       # Create a simple regression using two vectors
-      def new_from_vectors(vx,vy)
-        new(:init_vectors,vx,vy)
+      def new_from_vectors(vx,vy, opts=Hash.new)
+        new(:init_vectors,vx,vy, opts)
       end
       # Create a simple regression using a dataset and two vector names.
-      def new_from_dataset(ds,x,y)
-        new(:init_vectors,ds[x],ds[y])
+      def new_from_dataset(ds,x,y, opts=Hash.new)
+        new(:init_vectors,ds[x],ds[y], opts)
       end
     end
-    def init_vectors(vx,vy)
+    def init_vectors(vx,vy, opts=Hash.new)
       @vx,@vy=Statsample.only_valid_clone(vx,vy)
       x_m=@vx.mean
       y_m=@vy.mean
@@ -80,6 +82,17 @@ module Statsample
       }
       @b=num.to_f/den
       @a=y_m - @b*x_m
+
+      opts_default={
+        :digits=>3,
+        :name=>_("Regression of %s over %s") % [@vx.name, @vy.name]
+      }
+      @opts=opts_default.merge opts
+
+      @opts.each{|k,v|
+        self.send("#{k}=",v) if self.respond_to? k
+      }
+
     end
     def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
       @a=a
@@ -90,6 +103,18 @@ module Statsample
       @chisq=chisq
       @status=status
     end
+    def report_building(gen)
+      f="%0.#{digits}f"
+      gen.section(:name=>name) do |s|
+        s.table(:header=>[_("Variable"), _("Value")]) do |t|
+          t.row [_("r"), f % r]
+          t.row [_("r^2"), f % r2]
+          t.row [_("a"), f % a]
+          t.row [_("b"), f % a]
+          t.row [_("s.e"), f % standard_error]
+        end
+      end
+    end
     private :init_vectors, :init_gsl
   end
 end
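
Usage sketch (illustration only, not part of the changeset): with Summarizable included and the new :name/:digits options, a simple regression can render its own report. This assumes the class is reachable as Statsample::Regression::Simple and that summary comes from Summarizable, as in the released gem.

    require 'statsample'
    x = [1, 2, 3, 4, 5].to_scale
    y = [2, 4, 5, 4, 6].to_scale
    reg = Statsample::Regression::Simple.new_from_vectors(x, y,
            :name => "y over x", :digits => 4)
    puts reg.summary   # section with r, r^2, a, b and s.e rows, formatted to 4 digits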
data/lib/statsample/reliability.rb

@@ -5,6 +5,7 @@ module Statsample
     # only uses tuples without missing data
     def cronbach_alpha(ods)
       ds=ods.dup_only_valid
+      return nil if ds.vectors.any? {|k,v| v.variance==0}
       n_items=ds.fields.size
       return nil if n_items<=1
       s2_items=ds.vectors.inject(0) {|ac,v|
@@ -16,11 +17,18 @@ module Statsample
     # Calculate Chonbach's alpha for a given dataset
     # using standarized values for every vector.
     # Only uses tuples without missing data
-
+    # Return nil if one or more vectors has 0 variance
     def cronbach_alpha_standarized(ods)
-      ds=ods.dup_only_valid.fields.inject({}){|a,f|
-        a[f]=ods[f].standarized; a
+
+      ds=ods.dup_only_valid
+
+      return nil if ds.vectors.any? {|k,v| v.variance==0}
+
+      ds=ds.fields.inject({}){|a,f|
+        a[f]=ods[f].standarized;
+        a
       }.to_dataset
+
       cronbach_alpha(ds)
     end
     # Predicted reliability of a test by replicating
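
Usage sketch (illustration only): both alpha methods now short-circuit on a zero-variance item instead of failing downstream. The Hash#to_dataset conversion used here is the statsample helper from statsample/dataset.

    require 'statsample'
    ds = {
      'a' => [1, 2, 3, 4].to_scale,
      'b' => [2, 3, 3, 5].to_scale,
      'c' => [5, 5, 5, 5].to_scale   # constant item, variance 0
    }.to_dataset
    p Statsample::Reliability.cronbach_alpha(ds)              # => nil
    p Statsample::Reliability.cronbach_alpha_standarized(ds)  # => nil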
data/lib/statsample/reliability/scaleanalysis.rb

@@ -53,12 +53,12 @@ module Statsample
       @variances=@k.times.map {|i| @cov_m[i,i]}.to_scale
       @variances_mean=@variances.mean
       @covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
-      begin
+      #begin
       @alpha = Statsample::Reliability.cronbach_alpha(@ds)
       @alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(@ds)
-      rescue => e
-        raise DatasetException.new(@ds,e), "Error calculating alpha"
-      end
+      #rescue => e
+      #  raise DatasetException.new(@ds,e), "Error calculating alpha"
+      #end
     end
     # Returns a hash with structure
     def item_characteristic_curve
data/lib/statsample/shorthand.rb (new file)

@@ -0,0 +1,81 @@
+module Statsample
+  # Module which provide shorthands for many methods.
+  module Shorthand
+    ###
+    # :section: R like methods
+    ###
+
+    # Retrieve names (fields) from dataset
+    def names(ds)
+      ds.fields
+    end
+    # Create a correlation matrix from a dataset
+    def cor(ds)
+      Statsample::Bivariate.correlation_matrix(ds)
+    end
+    # Create a variance/covariance matrix from a dataset
+    def cov(ds)
+      Statsample::Bivariate.covariate_matrix(ds)
+    end
+    # Create a Statsample::Vector
+    # Analog to R's c
+    def c(*args)
+      Statsample::Vector[*args]
+    end
+    # Random generation for the normal distribution
+    def rnorm(n,mean=0,sd=1)
+      rng=Distribution::Normal.rng(mean,sd)
+      Statsample::Vector.new_scale(n) { rng.call}
+    end
+    # Creates a new Statsample::Dataset
+    # Each key is transformed into string
+    def dataset(vectors=Hash.new)
+      vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac}
+      Statsample::Dataset.new(vectors)
+    end
+    alias :data_frame :dataset
+    # Returns a Statsample::Graph::Boxplot
+    def boxplot(*args)
+      Statsample::Graph::Boxplot.new(*args)
+    end
+    # Returns a Statsample::Graph::Histogram
+    def histogram(*args)
+      Statsample::Graph::Histogram.new(*args)
+    end
+
+    # Returns a Statsample::Graph::Scatterplot
+    def scatterplot(*args)
+      Statsample::Graph::Scatterplot.new(*args)
+    end
+    # Returns a Statsample::Test::Levene
+    def levene(*args)
+      Statsample::Test::Levene.new(*args)
+    end
+    def principal_axis(*args)
+      Statsample::Factor::PrincipalAxis.new(*args)
+
+    end
+    def polychoric(*args)
+      Statsample::Bivariate::Polychoric.new(*args)
+    end
+    def tetrachoric(*args)
+      Statsample::Bivariate::Tetrachoric.new(*args)
+    end
+
+    ###
+    # Other Shortcuts
+    ###
+    def lr(*args)
+      Statsample::Regression.multiple(*args)
+    end
+    def pca(ds,opts=Hash.new)
+      Statsample::Factor::PCA.new(ds,opts)
+    end
+    def dominance_analysis(*args)
+      Statsample::DominanceAnalysis.new(*args)
+    end
+    def dominance_analysis_bootstrap(*args)
+      Statsample::DominanceAnalysis::Bootstrap.new(*args)
+    end
+  end
+end
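
Usage sketch (illustration only): the shorthands are meant to be mixed into whatever scope drives an analysis (the new Statsample::Analysis suites do this for their blocks); here they are simply included at the top level.

    require 'statsample'
    include Statsample::Shorthand

    x  = c(1, 2, 3, 4, 5)           # like R's c(): builds a Statsample::Vector
    e  = rnorm(5)                   # five N(0,1) draws via Distribution::Normal.rng
    y  = x * 2 + e                  # element-wise vector arithmetic
    ds = dataset(:x => x, :y => y)  # keys become string field names
    names(ds)                       # => ["x", "y"]
    cor(ds)                         # correlation matrix of the dataset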
data/lib/statsample/test/chisquare.rb

@@ -26,7 +26,7 @@ module Statsample
       @value
     end
     def probability
-      1-Distribution::ChiSquare.cdf(@value,@df)
+      1-Distribution::ChiSquare.cdf(@value.to_f,@df)
     end
     def compute_chi
       sum=0
data/lib/statsample/vector.rb

@@ -1,17 +1,31 @@
 require 'date'
-class Array
-  # Creates a new Statsample::Vector object
-  # Argument should be equal to Vector.new
-  def to_vector(*args)
+require 'statsample/vector/gsl'
+
+module Statsample::VectorShorthands
+  # Creates a new Statsample::Vector object
+  # Argument should be equal to Vector.new
+  def to_vector(*args)
     Statsample::Vector.new(self,*args)
   end
-  # Creates a new Statsample::Vector object of type :scale
-  def to_scale(*args)
-    Statsample::Vector.new(self, :scale,*args)
-  end
+  # Creates a new Statsample::Vector object of type :scale
+  def to_scale(*args)
+    Statsample::Vector.new(self, :scale, *args)
+  end
+end
+
+class Array
+  include Statsample::VectorShorthands
 end
 
+if Statsample.has_gsl?
+  module GSL
+    class Vector
+      include Statsample::VectorShorthands
+    end
+  end
+end
 module Statsample
+
 
   # Collection of values on one dimension. Works as a column on a Spreadsheet.
   #
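
Usage sketch (illustration only): to_vector/to_scale now live in a mixin, so Array keeps them and GSL::Vector gains them whenever the gsl extension is loaded.

    require 'statsample'
    [1, 2, 3].to_scale.mean            # => 2.0
    (1..10).to_a.to_vector(:ordinal)   # Array behaves as before
    if Statsample.has_gsl?
      GSL::Vector[1.0, 2.0, 3.0].to_scale  # GSL vectors now convert too
    end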
@@ -41,26 +55,21 @@ module Statsample
   attr_reader :data_with_nils
   # Date date, with all missing values replaced by nils
   attr_reader :date_data_with_nils
-  # GSL Object, only available with rbgsl extension and type==:scale
-  attr_reader :gsl
   # Change label for specific values
   attr_accessor :labels
   # Name of vector. Should be used for output by many classes
   attr_accessor :name
 
-  #
   # Creates a new Vector object.
-  # * <tt>data</tt> Array of data.
+  # * <tt>data</tt> Any data which can be converted on Array
   # * <tt>type</tt> Level of meausurement. See Vector#type
   # * <tt>opts</tt> Hash of options
   # * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
   # * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
   # * <tt>:labels</tt> Labels for data values
   # * <tt>:name</tt> Name of vector
-  #
   def initialize(data=[], type=:nominal, opts=Hash.new)
-    raise "Data should be an array" unless data.is_a? Array
-    @data=data
+    @data=data.is_a?(Array) ? data : data.to_a
     @type=type
     opts_default={
       :missing_values=>[],
@@ -84,9 +93,46 @@ module Statsample
     @missing_data=[]
     @has_missing_data=nil
     @scale_data=nil
-    set_valid_data_intern
+    set_valid_data
     self.type=type
   end
+  # Create a vector using (almost) any object
+  # * Array: flattened
+  # * Range: transformed using to_a
+  # * Statsample::Vector
+  # * Numeric and string values
+  def self.[](*args)
+    values=[]
+    args.each do |a|
+      case a
+      when Array
+        values.concat a.flatten
+      when Statsample::Vector
+        values.concat a.to_a
+      when Range
+        values.concat a.to_a
+      else
+        values << a
+      end
+    end
+    vector=new(values)
+    vector.type=:scale if vector.can_be_scale?
+    vector
+  end
+  # Create a new scale type vector
+  # Parameters
+  # [n] Size
+  # [val] Value of each value
+  # [&block] If block provided, is used to set the values of vector
+  def self.new_scale(n,val=nil, &block)
+    if block
+      vector=n.times.map {|i| block.call(i)}.to_scale
+    else
+      vector=n.times.map { val}.to_scale
+    end
+    vector.type=:scale
+    vector
+  end
   # Creates a duplicate of the Vector.
   # Note: data, missing_values and labels are duplicated, so
   # changes on original vector doesn't propages to copies.
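
Usage sketch (illustration only) of the two constructors added above: Vector.[] flattens arrays, expands ranges and promotes itself to :scale when every element is numeric; new_scale builds a scale vector from a size plus a value or block.

    require 'statsample'
    v = Statsample::Vector[1, 2, [3, 4], (5..7)]
    v.to_a   # => [1, 2, 3, 4, 5, 6, 7]
    v.type   # => :scale, because every element is Numeric

    w = Statsample::Vector.new_scale(5) { |i| i * 10 }
    w.to_a   # => [0, 10, 20, 30, 40]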
@@ -98,40 +144,48 @@ module Statsample
   def dup_empty
     Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
   end
-  # Raises an exception if type of vector is inferior to t type
-  def check_type(t)
-    raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
+
+  if Statsample::STATSAMPLE__.respond_to?(:check_type)
+    # Raises an exception if type of vector is inferior to t type
+    def check_type(t)
+      Statsample::STATSAMPLE__.check_type(self,t)
+    end
+  else
+    def check_type(t) #:nodoc:
+      _check_type(t)
+    end
+  end
+
+
+  def _check_type(t) #:nodoc:
+    raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
   end
-  private :check_type
 
+  def vector_standarized_compute(m,sd) # :nodoc:
+    @data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale)
+  end
   # Return a vector usign the standarized values for data
   # with sd with denominator n-1. With variance=0 or mean nil,
   # returns a vector of equal size full of nils
   #
-
   def vector_standarized(use_population=false)
     check_type :scale
-    return ([nil]*size).to_scale if mean.nil?
     m=mean
     sd=use_population ? sdp : sds
-    return ([nil]*size).to_scale if sd==0.0
-    vector=@data_with_nils.collect{|x|
-      if !x.nil?
-        (x.to_f - m).quo(sd)
-      else
-        nil
-      end
-    }.to_vector(:scale)
+    return ([nil]*size).to_scale if mean.nil? or sd==0.0
+    vector=vector_standarized_compute(m,sd)
     vector.name=_("%s(standarized)") % @name
     vector
   end
+  def vector_centered_compute(m) #:nodoc:
+    @data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale
+  end
   # Return a centered vector
   def vector_centered
     check_type :scale
     m=mean
-    vector=@data_with_nils.collect {|x|
-      x.nil? ? nil : x.to_f-m
-    }.to_scale
+    return ([nil]*size).to_scale if mean.nil?
+    vector=vector_centered_compute(m)
     vector.name=_("%s(centered)") % @name
     vector
   end
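
Usage sketch (illustration only): the refactored standardization returns an all-nil vector on degenerate input instead of dividing by zero.

    require 'statsample'
    [1, 2, 3].to_scale.vector_standarized.to_a  # => [-1.0, 0.0, 1.0] (sample sd is 1)
    [5, 5, 5].to_scale.vector_standarized.to_a  # => [nil, nil, nil]  (zero variance)
    [1, 2, 3].to_scale.vector_centered.to_a     # => [-1.0, 0.0, 1.0]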
@@ -148,18 +202,18 @@ module Statsample
     vector
   end
   def box_cox_transformation(lambda) # :nodoc:
-      raise "Should be a scale" unless @type==:scale
-      @data_with_nils.collect{|x|
-        if !x.nil?
-          if(lambda==0)
-            Math.log(x)
+    raise "Should be a scale" unless @type==:scale
+    @data_with_nils.collect{|x|
+      if !x.nil?
+        if(lambda==0)
+          Math.log(x)
+        else
+          (x**lambda-1).quo(lambda)
+        end
       else
-            (x**lambda-1).quo(lambda)
+        nil
       end
-        else
-          nil
-        end
-      }.to_vector(:scale)
+    }.to_vector(:scale)
   end
 
   # Vector equality.
@@ -193,6 +247,10 @@ module Statsample
     }
     set_valid_data
   end
+  def push(v)
+    @data.push(v)
+    set_valid_data
+  end
   # Dicotomize the vector with 0 and 1, based on lowest value
   # If parameter if defined, this value and lower
   # will be 0 and higher, 1
@@ -250,7 +308,6 @@ module Statsample
     @missing_data.clear
     @data_with_nils.clear
     @date_data_with_nils.clear
-    @gsl=nil
     set_valid_data_intern
     set_scale_data if(@type==:scale)
     set_date_data if(@type==:date)
@@ -281,11 +338,14 @@ module Statsample
   def has_missing_data?
     @has_missing_data
   end
+  alias :flawed? :has_missing_data?
+
   # Retrieves label for value x. Retrieves x if
   # no label defined.
   def labeling(x)
     @labels.has_key?(x) ? @labels[x].to_s : x.to_s
   end
+  alias :label :labeling
   # Returns a Vector with data with labels replaced by the label.
   def vector_labeled
     d=@data.collect{|x|
@@ -317,8 +377,7 @@ module Statsample
     !(x.nil? or @missing_values.include? x)
   end
   # Set missing_values.
-  # if update_valid = false, you should use
-  # set_valid_data after all changes
+  # set_valid_data is called after changes
   def missing_values=(vals)
     @missing_values = vals
     set_valid_data
@@ -335,7 +394,11 @@ module Statsample
     set_date_data if (t==:date)
   end
   def to_a
-    @data.dup
+    if @data.is_a? Array
+      @data.dup
+    else
+      @data.to_a
+    end
   end
   alias_method :to_ary, :to_a
 
@@ -357,6 +420,10 @@ module Statsample
   def -(v)
     _vector_ari("-",v)
   end
+
+  def *(v)
+    _vector_ari("*",v)
+  end
   # Reports all values that doesn't comply with a condition.
   # Returns a hash with the index of data and the invalid data.
   def verify
@@ -370,20 +437,16 @@ module Statsample
   end
   def _vector_ari(method,v) # :nodoc:
     if(v.is_a? Vector or v.is_a? Array)
-      if v.size==@data.size
-        # i=0
+      raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
       sum=[]
-      0.upto(v.size-1) {|i|
+      v.size.times {|i|
         if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
           sum.push(@data[i].send(method,v[i]))
         else
           sum.push(nil)
         end
       }
-      Statsample::Vector.new(sum, :scale )
-      else
-        raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
-      end
+      Statsample::Vector.new(sum, :scale)
     elsif(v.respond_to? method )
       Statsample::Vector.new(
         @data.collect {|x|
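
Usage sketch (illustration only) of the new element-wise * and the more informative size check: operations pair valid entries, propagate nil for missing data, and a length mismatch now reports both sizes.

    require 'statsample'
    a = [1, 2, nil, 4].to_scale
    b = [2, 2, 2, 2].to_scale
    (a * b).to_a   # => [2, 4, nil, 8]
    a * 10         # scalar form: [10, 20, nil, 40]
    a * [1, 2]     # raises ArgumentError (parameter size 2 vs vector size 4)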
@@ -451,10 +514,10 @@ module Statsample
     }
   end
   def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
-      split_by_separator(sep).inject({}) {|a,v|
-        a[v[0]]=v[1].inject {|s,x| s+x.to_i}
-        a
-      }
+    split_by_separator(sep).inject({}) {|a,v|
+      a[v[0]]=v[1].inject {|s,x| s+x.to_i}
+      a
+    }
   end
 
   # Returns an random sample of size n, with replacement,
@@ -463,13 +526,8 @@ module Statsample
   # In all the trails, every item have the same probability
   # of been selected.
   def sample_with_replacement(sample=1)
-    if(@type!=:scale or !Statsample.has_gsl?)
-      vds=@valid_data.size
-      (0...sample).collect{ @valid_data[rand(vds)] }
-    else
-      r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
-      r.sample(@gsl, sample).to_a
-    end
+    vds=@valid_data.size
+    (0...sample).collect{ @valid_data[rand(vds)] }
   end
   # Returns an random sample of size n, without replacement,
   # only with valid data.
@@ -479,7 +537,6 @@ module Statsample
   # A sample of the same size of the vector is the vector itself.
 
   def sample_without_replacement(sample=1)
-    if(@type!=:scale or !Statsample.has_gsl?)
     raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
     out=[]
     size=@valid_data.size
@@ -487,11 +544,7 @@ module Statsample
       value=rand(size)
       out.push(value) if !out.include?value
     end
-    out.collect{|i|@data[i]}
-    else
-      r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
-      r.choose(@gsl, sample).to_a
-    end
+    out.collect{|i| @data[i]}
   end
   # Retrieves number of cases which comply condition.
   # If block given, retrieves number of instances where
@@ -535,11 +588,11 @@ module Statsample
   end
   # Return true if all data is Numeric or nil
   def can_be_scale?
-      if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
-        false
-      else
-        true
-      end
+    if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
+      false
+    else
+      true
+    end
   end
 
   def to_s
@@ -560,13 +613,13 @@ module Statsample
   end
   # Retrieves uniques values for data.
   def factors
-      if @type==:scale
-        @scale_data.uniq.sort
-      elsif @type==:date
-        @date_data_with_nils.uniq.sort
-      else
-        @valid_data.uniq.sort
-      end
+    if @type==:scale
+      @scale_data.uniq.sort
+    elsif @type==:date
+      @date_data_with_nils.uniq.sort
+    else
+      @valid_data.uniq.sort
+    end
   end
   if Statsample::STATSAMPLE__.respond_to?(:frequencies)
     # Returns a hash with the distribution of frecuencies for
@@ -579,6 +632,8 @@ module Statsample
       _frequencies
     end
   end
+
+
   def _frequencies #:nodoc:
     @valid_data.inject(Hash.new) {|a,x|
       a[x]||=0
@@ -589,7 +644,7 @@ module Statsample
 
   # Returns the most frequent item.
   def mode
-    frequencies.max{|a,b| a[1]<=>b[1]}[0]
+    frequencies.max{|a,b| a[1]<=>b[1]}.first
   end
   # The numbers of item with valid data.
   def n_valid
@@ -678,22 +733,17 @@ module Statsample
   # Return the median (percentil 50)
   def median
     check_type :ordinal
-    if Statsample.has_gsl? and @type==:scale
-      sorted=GSL::Vector.alloc(@scale_data.sort)
-      GSL::Stats::median_from_sorted_data(sorted)
-    else
-      percentil(50)
-    end
+    percentil(50)
   end
   # Minimun value
   def min
     check_type :ordinal
-    @valid_data.min;
+    @valid_data.min
   end
   # Maximum value
   def max
     check_type :ordinal
-    @valid_data.max;
+    @valid_data.max
   end
 
   def set_date_data
@@ -722,9 +772,6 @@ module Statsample
         x.to_f
       end
     end
-    if Statsample.has_gsl?
-      @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
-    end
   end
 
   private :set_date_data, :set_scale_data
@@ -791,7 +838,6 @@ module Statsample
   # Sample Standard deviation (denominator n-1)
   def standard_deviation_sample(m=nil)
     check_type :scale
-
     m||=mean
     Math::sqrt(variance_sample(m))
   end
@@ -816,76 +862,30 @@ module Statsample
     check_type :scale
     @scale_data.inject(1){|a,x| a*x }
   end
-  if Statsample.has_gsl?
-    %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
-      m_nuevo=(m+"_slow").intern
-      alias_method m_nuevo, m.intern
-    }
-    def sum # :nodoc:
-      check_type :scale
-
-      @gsl.sum
-    end
-    def mean # :nodoc:
+
+  # With a fixnum, creates X bins within the range of data
+  # With an Array, each value will be a cut point
+  def histogram(bins=10)
     check_type :scale
-      @gsl.nil? ? nil : @gsl.mean
-    end
-    def variance_sample(m=nil) # :nodoc:
-      check_type :scale
-      m||=mean
-      @gsl.variance_m
-    end
-    def standard_deviation_sample(m=nil) # :nodoc:
-      check_type :scale
-      return nil if @gsl.nil?
-      m||=mean
-      @gsl.sd(m)
-    end
 
-    def variance_population(m=nil) # :nodoc:
-      check_type :scale
-      m||=mean
-      @gsl.variance_with_fixed_mean(m)
-    end
-    def standard_deviation_population(m=nil) # :nodoc:
-      check_type :scale
-      m||=mean
-      @gsl.sd_with_fixed_mean(m)
-    end
-    def skew # :nodoc:
-      check_type :scale
-      @gsl.skew
-    end
-    def kurtosis # :nodoc:
-      check_type :scale
-      @gsl.kurtosis
-    end
-    # Create a GSL::Histogram
-    # With a fixnum, creates X bins within the range of data
-    # With an Array, each value will be a cut point
-    def histogram(bins=10)
-      check_type :scale
-
-      if bins.is_a? Array
-        #h=Statsample::Histogram.new(self, bins)
-        h=Statsample::Histogram.alloc(bins)
-      else
-        # ugly patch. The upper limit for a bin has the form
-        # x < range
-        #h=Statsample::Histogram.new(self, bins)
-        min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
-        # fix last data
-        if max==@valid_data.max
-          max+=1e-10
-        end
-        h=Statsample::Histogram.alloc(bins,[min,max])
-        # Fix last bin
-
+    if bins.is_a? Array
+      #h=Statsample::Histogram.new(self, bins)
+      h=Statsample::Histogram.alloc(bins)
+    else
+      # ugly patch. The upper limit for a bin has the form
+      # x < range
+      #h=Statsample::Histogram.new(self, bins)
+      min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
+      # fix last data
+      if max==@valid_data.max
+        max+=1e-10
       end
-      h.increment(@valid_data)
-      h
+      h=Statsample::Histogram.alloc(bins,[min,max])
+      # Fix last bin
+
     end
-
+    h.increment(@valid_data)
+    h
   end
   end
   # Coefficient of variation
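
Usage sketch (illustration only): histogram is now plain Ruby (Statsample::Histogram) rather than a GSL::Histogram, so it also works without the gsl extension.

    require 'statsample'
    v = [1, 2, 2, 3, 3, 3, 4, 4, 5].to_scale
    h = v.histogram(4)   # Statsample::Histogram with 4 bins over a "nice" min..max range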
@@ -894,7 +894,6 @@ module Statsample
     check_type :scale
     standard_deviation_sample.quo(mean)
   end
-
   alias_method :sdp, :standard_deviation_population
   alias_method :sds, :standard_deviation_sample
   alias_method :adp, :average_deviation_population
@@ -902,5 +901,6 @@ module Statsample
   alias_method :variance, :variance_sample
   alias_method :sd, :standard_deviation_sample
   alias_method :ss, :sum_of_squares
+  include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
 end
 end