statsample 0.18.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.txt +23 -0
  3. data/Manifest.txt +28 -17
  4. data/Rakefile +3 -2
  5. data/benchmarks/correlation_matrix_15_variables.rb +31 -0
  6. data/benchmarks/correlation_matrix_5_variables.rb +32 -0
  7. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  8. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  9. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  11. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  13. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  14. data/benchmarks/factor_map.rb +37 -0
  15. data/benchmarks/helpers_benchmark.rb +5 -0
  16. data/examples/boxplot.rb +13 -14
  17. data/examples/correlation_matrix.rb +16 -8
  18. data/examples/dataset.rb +13 -4
  19. data/examples/dominance_analysis.rb +23 -17
  20. data/examples/dominance_analysis_bootstrap.rb +28 -22
  21. data/examples/histogram.rb +8 -9
  22. data/examples/icc.rb +20 -21
  23. data/examples/levene.rb +10 -4
  24. data/examples/multiple_regression.rb +9 -28
  25. data/examples/multivariate_correlation.rb +9 -3
  26. data/examples/parallel_analysis.rb +20 -16
  27. data/examples/polychoric.rb +15 -9
  28. data/examples/principal_axis.rb +18 -6
  29. data/examples/reliability.rb +26 -13
  30. data/examples/scatterplot.rb +10 -6
  31. data/examples/t_test.rb +15 -6
  32. data/examples/tetrachoric.rb +9 -2
  33. data/examples/u_test.rb +12 -4
  34. data/examples/vector.rb +13 -2
  35. data/examples/velicer_map_test.rb +33 -26
  36. data/lib/statsample.rb +32 -12
  37. data/lib/statsample/analysis.rb +79 -0
  38. data/lib/statsample/analysis/suite.rb +72 -0
  39. data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
  40. data/lib/statsample/bivariate.rb +70 -16
  41. data/lib/statsample/dataset.rb +25 -19
  42. data/lib/statsample/dominanceanalysis.rb +2 -2
  43. data/lib/statsample/factor.rb +2 -0
  44. data/lib/statsample/factor/map.rb +16 -10
  45. data/lib/statsample/factor/parallelanalysis.rb +9 -3
  46. data/lib/statsample/factor/pca.rb +28 -32
  47. data/lib/statsample/factor/rotation.rb +15 -8
  48. data/lib/statsample/graph/boxplot.rb +3 -4
  49. data/lib/statsample/graph/histogram.rb +2 -1
  50. data/lib/statsample/graph/scatterplot.rb +1 -0
  51. data/lib/statsample/matrix.rb +106 -16
  52. data/lib/statsample/regression.rb +4 -1
  53. data/lib/statsample/regression/binomial.rb +1 -1
  54. data/lib/statsample/regression/multiple/baseengine.rb +19 -9
  55. data/lib/statsample/regression/multiple/gslengine.rb +127 -126
  56. data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
  57. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  58. data/lib/statsample/regression/simple.rb +31 -6
  59. data/lib/statsample/reliability.rb +11 -3
  60. data/lib/statsample/reliability/scaleanalysis.rb +4 -4
  61. data/lib/statsample/shorthand.rb +81 -0
  62. data/lib/statsample/test/chisquare.rb +1 -1
  63. data/lib/statsample/vector.rb +163 -163
  64. data/lib/statsample/vector/gsl.rb +106 -0
  65. data/references.txt +2 -2
  66. data/{data → test/fixtures}/crime.txt +0 -0
  67. data/{data → test/fixtures}/hartman_23.matrix +0 -0
  68. data/{data → test/fixtures}/repeated_fields.csv +0 -0
  69. data/{data → test/fixtures}/test_binomial.csv +0 -0
  70. data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
  71. data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
  72. data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
  73. data/{data → test/fixtures}/tetmat_test.txt +0 -0
  74. data/test/helpers_tests.rb +18 -2
  75. data/test/test_analysis.rb +118 -0
  76. data/test/test_anovatwoway.rb +1 -1
  77. data/test/test_anovatwowaywithdataset.rb +1 -1
  78. data/test/test_anovawithvectors.rb +1 -2
  79. data/test/test_bartlettsphericity.rb +1 -2
  80. data/test/test_bivariate.rb +64 -22
  81. data/test/test_codification.rb +1 -2
  82. data/test/test_crosstab.rb +1 -2
  83. data/test/test_csv.rb +3 -4
  84. data/test/test_dataset.rb +24 -3
  85. data/test/test_dominance_analysis.rb +1 -2
  86. data/test/test_factor.rb +8 -69
  87. data/test/test_factor_map.rb +43 -0
  88. data/test/test_factor_pa.rb +54 -0
  89. data/test/test_ggobi.rb +1 -1
  90. data/test/test_gsl.rb +12 -18
  91. data/test/test_histogram.rb +1 -2
  92. data/test/test_logit.rb +62 -18
  93. data/test/test_matrix.rb +4 -5
  94. data/test/test_mle.rb +3 -4
  95. data/test/test_regression.rb +21 -2
  96. data/test/test_reliability.rb +3 -3
  97. data/test/test_reliability_icc.rb +1 -1
  98. data/test/test_reliability_skillscale.rb +20 -4
  99. data/test/test_resample.rb +1 -2
  100. data/test/test_rserve_extension.rb +1 -2
  101. data/test/test_srs.rb +1 -2
  102. data/test/test_statistics.rb +1 -2
  103. data/test/test_stest.rb +1 -2
  104. data/test/test_stratified.rb +1 -2
  105. data/test/test_test_f.rb +1 -2
  106. data/test/test_test_t.rb +1 -2
  107. data/test/test_umannwhitney.rb +1 -2
  108. data/test/test_vector.rb +117 -18
  109. data/test/test_xls.rb +2 -3
  110. data/web/Rakefile +39 -0
  111. metadata +109 -29
  112. metadata.gz.sig +0 -0
  113. data/examples/parallel_analysis_tetrachoric.rb +0 -31
  114. data/lib/distribution.rb +0 -25
  115. data/lib/distribution/chisquare.rb +0 -23
  116. data/lib/distribution/f.rb +0 -35
  117. data/lib/distribution/normal.rb +0 -60
  118. data/lib/distribution/normalbivariate.rb +0 -284
  119. data/lib/distribution/normalmultivariate.rb +0 -73
  120. data/lib/distribution/t.rb +0 -55
  121. data/test/test_distribution.rb +0 -73
@@ -30,13 +30,13 @@ class MatrixEngine < BaseEngine
30
30
 
31
31
  # Number of cases
32
32
  attr_writer :cases
33
-
33
+ attr_writer :digits
34
34
  # Create object
35
35
  #
36
36
  def initialize(matrix,y_var, opts=Hash.new)
37
37
  matrix.extend Statsample::CovariateMatrix
38
38
  raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var
39
- if matrix.type==:covariance
39
+ if matrix._type==:covariance
40
40
  @matrix_cov=matrix
41
41
  @matrix_cor=matrix.correlation
42
42
  @no_covariance=false
@@ -53,6 +53,8 @@ class MatrixEngine < BaseEngine
53
53
  @predictors_n=@n_predictors
54
54
  @matrix_x= @matrix_cor.submatrix(@fields)
55
55
  @matrix_x_cov= @matrix_cov.submatrix(@fields)
56
+ raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15
57
+
56
58
 
57
59
  @matrix_y = @matrix_cor.submatrix(@fields, [y_var])
58
60
  @matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])
@@ -75,13 +77,14 @@ class MatrixEngine < BaseEngine
75
77
  @y_mean=0.0
76
78
  @name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
77
79
 
78
-
80
+ opts_default={:digits=>3}
81
+ opts=opts_default.merge opts
79
82
  opts.each{|k,v|
80
83
  self.send("#{k}=",v) if self.respond_to? k
81
84
  }
82
85
  result_matrix=@matrix_x_cov.inverse * @matrix_y_cov
83
86
 
84
- if matrix.type==:covariance
87
+ if matrix._type==:covariance
85
88
  @coeffs=result_matrix.column(0).to_a
86
89
  @coeffs_stan=coeffs.collect {|k,v|
87
90
  coeffs[k]*@x_sd[k].quo(@y_sd)
@@ -141,7 +144,7 @@ class MatrixEngine < BaseEngine
141
144
  # Tolerance for a given variable
142
145
  # defined as (1-R^2) of regression of other independent variables
143
146
  # over the selected
144
- # Reference:
147
+ # == Reference:
145
148
  # * http://talkstats.com/showthread.php?t=5056
146
149
  def tolerance(var)
147
150
  return 1 if @matrix_x.column_size==1
@@ -17,7 +17,7 @@ module Multiple
17
17
 
18
18
  class RubyEngine < MatrixEngine
19
19
  def initialize(ds,y_var, opts=Hash.new)
20
- matrix=Statsample::Bivariate.correlation_matrix(ds)
20
+ matrix=ds.correlation_matrix
21
21
  fields_indep=ds.fields-[y_var]
22
22
  default={
23
23
  :y_mean=>ds[y_var].mean,
@@ -8,8 +8,10 @@ module Statsample
8
8
  # * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
9
9
  #
10
10
  class Simple
11
+ include Summarizable
11
12
  attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
12
-
13
+ attr_accessor :name
14
+ attr_accessor :digits
13
15
  def initialize(init_method, *argv)
14
16
  self.send(init_method, *argv)
15
17
  end
@@ -61,15 +63,15 @@ module Statsample
61
63
  new(:init_gsl, *ar)
62
64
  end
63
65
  # Create a simple regression using two vectors
64
- def new_from_vectors(vx,vy)
65
- new(:init_vectors,vx,vy)
66
+ def new_from_vectors(vx,vy, opts=Hash.new)
67
+ new(:init_vectors,vx,vy, opts)
66
68
  end
67
69
  # Create a simple regression using a dataset and two vector names.
68
- def new_from_dataset(ds,x,y)
69
- new(:init_vectors,ds[x],ds[y])
70
+ def new_from_dataset(ds,x,y, opts=Hash.new)
71
+ new(:init_vectors,ds[x],ds[y], opts)
70
72
  end
71
73
  end
72
- def init_vectors(vx,vy)
74
+ def init_vectors(vx,vy, opts=Hash.new)
73
75
  @vx,@vy=Statsample.only_valid_clone(vx,vy)
74
76
  x_m=@vx.mean
75
77
  y_m=@vy.mean
@@ -80,6 +82,17 @@ module Statsample
80
82
  }
81
83
  @b=num.to_f/den
82
84
  @a=y_m - @b*x_m
85
+
86
+ opts_default={
87
+ :digits=>3,
88
+ :name=>_("Regression of %s over %s") % [@vx.name, @vy.name]
89
+ }
90
+ @opts=opts_default.merge opts
91
+
92
+ @opts.each{|k,v|
93
+ self.send("#{k}=",v) if self.respond_to? k
94
+ }
95
+
83
96
  end
84
97
  def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
85
98
  @a=a
@@ -90,6 +103,18 @@ module Statsample
90
103
  @chisq=chisq
91
104
  @status=status
92
105
  end
106
# Adds a summary of this simple regression to the report generator +gen+.
#
# Opens a section titled with +name+ that holds a two-column table
# (Variable / Value) with one row each for r, r^2, a, b and s.e.
# Every value is formatted with +digits+ decimal places.
def report_building(gen)
  f="%0.#{digits}f"
  gen.section(:name=>name) do |s|
    s.table(:header=>[_("Variable"), _("Value")]) do |t|
      t.row [_("r"), f % r]
      t.row [_("r^2"), f % r2]
      t.row [_("a"), f % a]
      # The "b" row reports the slope b, not the intercept a
      t.row [_("b"), f % b]
      t.row [_("s.e"), f % standard_error]
    end
  end
end
93
118
  private :init_vectors, :init_gsl
94
119
  end
95
120
  end
@@ -5,6 +5,7 @@ module Statsample
5
5
  # only uses tuples without missing data
6
6
  def cronbach_alpha(ods)
7
7
  ds=ods.dup_only_valid
8
+ return nil if ds.vectors.any? {|k,v| v.variance==0}
8
9
  n_items=ds.fields.size
9
10
  return nil if n_items<=1
10
11
  s2_items=ds.vectors.inject(0) {|ac,v|
@@ -16,11 +17,18 @@ module Statsample
16
17
  # Calculate Cronbach's alpha for a given dataset
17
18
  # using standarized values for every vector.
18
19
  # Only uses tuples without missing data
19
-
20
+ # Return nil if one or more vectors has 0 variance
20
21
  def cronbach_alpha_standarized(ods)
21
- ds=ods.dup_only_valid.fields.inject({}){|a,f|
22
- a[f]=ods[f].standarized; a
22
+
23
+ ds=ods.dup_only_valid
24
+
25
+ return nil if ds.vectors.any? {|k,v| v.variance==0}
26
+
27
+ ds=ds.fields.inject({}){|a,f|
28
+ a[f]=ods[f].standarized;
29
+ a
23
30
  }.to_dataset
31
+
24
32
  cronbach_alpha(ds)
25
33
  end
26
34
  # Predicted reliability of a test by replicating
@@ -53,12 +53,12 @@ module Statsample
53
53
  @variances=@k.times.map {|i| @cov_m[i,i]}.to_scale
54
54
  @variances_mean=@variances.mean
55
55
  @covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
56
- begin
56
+ #begin
57
57
  @alpha = Statsample::Reliability.cronbach_alpha(@ds)
58
58
  @alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(@ds)
59
- rescue => e
60
- raise DatasetException.new(@ds,e), "Error calculating alpha"
61
- end
59
+ #rescue => e
60
+ # raise DatasetException.new(@ds,e), "Error calculating alpha"
61
+ #end
62
62
  end
63
63
  # Returns a hash with structure
64
64
  def item_characteristic_curve
@@ -0,0 +1,81 @@
1
module Statsample
  # Module which provides shorthands for many methods.
  # Every method here is a thin wrapper: it forwards its arguments to the
  # Statsample class or module named in its description.
  module Shorthand
    ###
    # :section: R like methods
    ###

    # Retrieve names (fields) from dataset +ds+
    def names(ds)
      ds.fields
    end

    # Create a correlation matrix from a dataset
    # (see Statsample::Bivariate.correlation_matrix)
    def cor(ds)
      Statsample::Bivariate.correlation_matrix(ds)
    end

    # Create a variance/covariance matrix from a dataset
    # (see Statsample::Bivariate.covariate_matrix)
    def cov(ds)
      Statsample::Bivariate.covariate_matrix(ds)
    end

    # Create a Statsample::Vector from the given values.
    # Analog to R's c
    def c(*args)
      Statsample::Vector[*args]
    end

    # Random generation for the normal distribution:
    # a scale vector of +n+ values, each one drawn from the generator
    # returned by Distribution::Normal.rng(mean,sd)
    def rnorm(n,mean=0,sd=1)
      generator=Distribution::Normal.rng(mean,sd)
      Statsample::Vector.new_scale(n) { generator.call }
    end

    # Creates a new Statsample::Dataset
    # Each key is transformed into string
    def dataset(vectors=Hash.new)
      by_name=vectors.each_with_object({}) do |pair,acc|
        acc[pair[0].to_s]=pair[1]
      end
      Statsample::Dataset.new(by_name)
    end
    alias data_frame dataset

    # Returns a Statsample::Graph::Boxplot
    def boxplot(*args)
      Statsample::Graph::Boxplot.new(*args)
    end

    # Returns a Statsample::Graph::Histogram
    def histogram(*args)
      Statsample::Graph::Histogram.new(*args)
    end

    # Returns a Statsample::Graph::Scatterplot
    def scatterplot(*args)
      Statsample::Graph::Scatterplot.new(*args)
    end

    # Returns a Statsample::Test::Levene
    def levene(*args)
      Statsample::Test::Levene.new(*args)
    end

    # Returns a Statsample::Factor::PrincipalAxis
    def principal_axis(*args)
      Statsample::Factor::PrincipalAxis.new(*args)
    end

    # Returns a Statsample::Bivariate::Polychoric
    def polychoric(*args)
      Statsample::Bivariate::Polychoric.new(*args)
    end

    # Returns a Statsample::Bivariate::Tetrachoric
    def tetrachoric(*args)
      Statsample::Bivariate::Tetrachoric.new(*args)
    end

    ###
    # Other Shortcuts
    ###

    # Returns the result of Statsample::Regression.multiple
    def lr(*args)
      Statsample::Regression.multiple(*args)
    end

    # Returns a Statsample::Factor::PCA built from +ds+ and +opts+
    def pca(ds,opts=Hash.new)
      Statsample::Factor::PCA.new(ds,opts)
    end

    # Returns a Statsample::DominanceAnalysis
    def dominance_analysis(*args)
      Statsample::DominanceAnalysis.new(*args)
    end

    # Returns a Statsample::DominanceAnalysis::Bootstrap
    def dominance_analysis_bootstrap(*args)
      Statsample::DominanceAnalysis::Bootstrap.new(*args)
    end
  end
end
@@ -26,7 +26,7 @@ module Statsample
26
26
  @value
27
27
  end
28
28
  def probability
29
- 1-Distribution::ChiSquare.cdf(@value,@df)
29
+ 1-Distribution::ChiSquare.cdf(@value.to_f,@df)
30
30
  end
31
31
  def compute_chi
32
32
  sum=0
@@ -1,17 +1,31 @@
1
1
  require 'date'
2
- class Array
3
- # Creates a new Statsample::Vector object
4
- # Argument should be equal to Vector.new
5
- def to_vector(*args)
2
+ require 'statsample/vector/gsl'
3
+
4
+ module Statsample::VectorShorthands
5
+ # Creates a new Statsample::Vector object
6
+ # Argument should be equal to Vector.new
7
+ def to_vector(*args)
6
8
  Statsample::Vector.new(self,*args)
7
9
  end
8
- # Creates a new Statsample::Vector object of type :scale
9
- def to_scale(*args)
10
- Statsample::Vector.new(self, :scale,*args)
11
- end
10
+ # Creates a new Statsample::Vector object of type :scale
11
+ def to_scale(*args)
12
+ Statsample::Vector.new(self, :scale, *args)
13
+ end
14
+ end
15
+
16
+ class Array
17
+ include Statsample::VectorShorthands
12
18
  end
13
19
 
20
+ if Statsample.has_gsl?
21
+ module GSL
22
+ class Vector
23
+ include Statsample::VectorShorthands
24
+ end
25
+ end
26
+ end
14
27
  module Statsample
28
+
15
29
 
16
30
  # Collection of values on one dimension. Works as a column on a Spreadsheet.
17
31
  #
@@ -41,26 +55,21 @@ module Statsample
41
55
  attr_reader :data_with_nils
42
56
  # Date data, with all missing values replaced by nils
43
57
  attr_reader :date_data_with_nils
44
- # GSL Object, only available with rbgsl extension and type==:scale
45
- attr_reader :gsl
46
58
  # Change label for specific values
47
59
  attr_accessor :labels
48
60
  # Name of vector. Should be used for output by many classes
49
61
  attr_accessor :name
50
62
 
51
- #
52
63
  # Creates a new Vector object.
53
- # * <tt>data</tt> Array of data.
64
+ # * <tt>data</tt> Any data which can be converted to an Array
54
65
  # * <tt>type</tt> Level of measurement. See Vector#type
55
66
  # * <tt>opts</tt> Hash of options
56
67
  # * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
57
68
  # * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
58
69
  # * <tt>:labels</tt> Labels for data values
59
70
  # * <tt>:name</tt> Name of vector
60
- #
61
71
  def initialize(data=[], type=:nominal, opts=Hash.new)
62
- raise "Data should be an array" unless data.is_a? Array
63
- @data=data
72
+ @data=data.is_a?(Array) ? data : data.to_a
64
73
  @type=type
65
74
  opts_default={
66
75
  :missing_values=>[],
@@ -84,9 +93,46 @@ module Statsample
84
93
  @missing_data=[]
85
94
  @has_missing_data=nil
86
95
  @scale_data=nil
87
- set_valid_data_intern
96
+ set_valid_data
88
97
  self.type=type
89
98
  end
99
# Create a vector using (almost) any object. Each argument is added
# to the values of the new vector according to its class:
# * Array: flattened
# * Range: transformed using to_a
# * Statsample::Vector: its values, taken with to_a
# * Numeric and string values (and anything else): appended as is
# The vector type is set to :scale when can_be_scale? is true.
def self.[](*args)
  values=args.each_with_object([]) do |item,acc|
    case item
    when Array
      acc.concat(item.flatten)
    when Statsample::Vector, Range
      acc.concat(item.to_a)
    else
      acc << item
    end
  end
  vector=new(values)
  vector.type=:scale if vector.can_be_scale?
  vector
end
122
# Create a new scale type vector
# Parameters
# [n] Size
# [val] Value of each value, used when no block is given
# [&block] If block provided, it is called with each index (0...n)
#          and its result is used as the value on that position
def self.new_scale(n,val=nil, &block)
  # Without a block, every position receives val
  filler=block || lambda {|_i| val }
  vector=n.times.map {|i| filler.call(i) }.to_scale
  vector.type=:scale
  vector
end
90
136
  # Creates a duplicate of the Vector.
91
137
  # Note: data, missing_values and labels are duplicated, so
92
138
  # changes to the original vector don't propagate to copies.
@@ -98,40 +144,48 @@ module Statsample
98
144
  def dup_empty
99
145
  Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
100
146
  end
101
- # Raises an exception if type of vector is inferior to t type
102
- def check_type(t)
103
- raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date)
147
+
148
+ if Statsample::STATSAMPLE__.respond_to?(:check_type)
149
+ # Raises an exception if type of vector is inferior to t type
150
+ def check_type(t)
151
+ Statsample::STATSAMPLE__.check_type(self,t)
152
+ end
153
+ else
154
+ def check_type(t) #:nodoc:
155
+ _check_type(t)
156
+ end
157
+ end
158
+
159
+
160
+ def _check_type(t) #:nodoc:
161
+ raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
104
162
  end
105
- private :check_type
106
163
 
164
+ def vector_standarized_compute(m,sd) # :nodoc:
165
+ @data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale)
166
+ end
107
167
  # Return a vector usign the standarized values for data
108
168
  # with sd with denominator n-1. With variance=0 or mean nil,
109
169
  # returns a vector of equal size full of nils
110
170
  #
111
-
112
171
  def vector_standarized(use_population=false)
113
172
  check_type :scale
114
- return ([nil]*size).to_scale if mean.nil?
115
173
  m=mean
116
174
  sd=use_population ? sdp : sds
117
- return ([nil]*size).to_scale if sd==0.0
118
- vector=@data_with_nils.collect{|x|
119
- if !x.nil?
120
- (x.to_f - m).quo(sd)
121
- else
122
- nil
123
- end
124
- }.to_vector(:scale)
175
+ return ([nil]*size).to_scale if mean.nil? or sd==0.0
176
+ vector=vector_standarized_compute(m,sd)
125
177
  vector.name=_("%s(standarized)") % @name
126
178
  vector
127
179
  end
180
+ def vector_centered_compute(m) #:nodoc:
181
+ @data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale
182
+ end
128
183
  # Return a centered vector
129
184
  def vector_centered
130
185
  check_type :scale
131
186
  m=mean
132
- vector=@data_with_nils.collect {|x|
133
- x.nil? ? nil : x.to_f-m
134
- }.to_scale
187
+ return ([nil]*size).to_scale if mean.nil?
188
+ vector=vector_centered_compute(m)
135
189
  vector.name=_("%s(centered)") % @name
136
190
  vector
137
191
  end
@@ -148,18 +202,18 @@ module Statsample
148
202
  vector
149
203
  end
150
204
  def box_cox_transformation(lambda) # :nodoc:
151
- raise "Should be a scale" unless @type==:scale
152
- @data_with_nils.collect{|x|
153
- if !x.nil?
154
- if(lambda==0)
155
- Math.log(x)
205
+ raise "Should be a scale" unless @type==:scale
206
+ @data_with_nils.collect{|x|
207
+ if !x.nil?
208
+ if(lambda==0)
209
+ Math.log(x)
210
+ else
211
+ (x**lambda-1).quo(lambda)
212
+ end
156
213
  else
157
- (x**lambda-1).quo(lambda)
214
+ nil
158
215
  end
159
- else
160
- nil
161
- end
162
- }.to_vector(:scale)
216
+ }.to_vector(:scale)
163
217
  end
164
218
 
165
219
  # Vector equality.
@@ -193,6 +247,10 @@ module Statsample
193
247
  }
194
248
  set_valid_data
195
249
  end
250
+ def push(v)
251
+ @data.push(v)
252
+ set_valid_data
253
+ end
196
254
  # Dichotomize the vector with 0 and 1, based on lowest value
197
255
  # If parameter is defined, this value and lower
198
256
  # will be 0 and higher, 1
@@ -250,7 +308,6 @@ module Statsample
250
308
  @missing_data.clear
251
309
  @data_with_nils.clear
252
310
  @date_data_with_nils.clear
253
- @gsl=nil
254
311
  set_valid_data_intern
255
312
  set_scale_data if(@type==:scale)
256
313
  set_date_data if(@type==:date)
@@ -281,11 +338,14 @@ module Statsample
281
338
  def has_missing_data?
282
339
  @has_missing_data
283
340
  end
341
+ alias :flawed? :has_missing_data?
342
+
284
343
  # Retrieves label for value x. Retrieves x if
285
344
  # no label defined.
286
345
  def labeling(x)
287
346
  @labels.has_key?(x) ? @labels[x].to_s : x.to_s
288
347
  end
348
+ alias :label :labeling
289
349
  # Returns a Vector with data with labels replaced by the label.
290
350
  def vector_labeled
291
351
  d=@data.collect{|x|
@@ -317,8 +377,7 @@ module Statsample
317
377
  !(x.nil? or @missing_values.include? x)
318
378
  end
319
379
  # Set missing_values.
320
- # if update_valid = false, you should use
321
- # set_valid_data after all changes
380
+ # set_valid_data is called after changes
322
381
  def missing_values=(vals)
323
382
  @missing_values = vals
324
383
  set_valid_data
@@ -335,7 +394,11 @@ module Statsample
335
394
  set_date_data if (t==:date)
336
395
  end
337
396
  def to_a
338
- @data.dup
397
+ if @data.is_a? Array
398
+ @data.dup
399
+ else
400
+ @data.to_a
401
+ end
339
402
  end
340
403
  alias_method :to_ary, :to_a
341
404
 
@@ -357,6 +420,10 @@ module Statsample
357
420
  def -(v)
358
421
  _vector_ari("-",v)
359
422
  end
423
+
424
+ def *(v)
425
+ _vector_ari("*",v)
426
+ end
360
427
  # Reports all values that don't comply with a condition.
361
428
  # Returns a hash with the index of data and the invalid data.
362
429
  def verify
@@ -370,20 +437,16 @@ module Statsample
370
437
  end
371
438
  def _vector_ari(method,v) # :nodoc:
372
439
  if(v.is_a? Vector or v.is_a? Array)
373
- if v.size==@data.size
374
- # i=0
440
+ raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
375
441
  sum=[]
376
- 0.upto(v.size-1) {|i|
442
+ v.size.times {|i|
377
443
  if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
378
444
  sum.push(@data[i].send(method,v[i]))
379
445
  else
380
446
  sum.push(nil)
381
447
  end
382
448
  }
383
- Statsample::Vector.new(sum, :scale )
384
- else
385
- raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
386
- end
449
+ Statsample::Vector.new(sum, :scale)
387
450
  elsif(v.respond_to? method )
388
451
  Statsample::Vector.new(
389
452
  @data.collect {|x|
@@ -451,10 +514,10 @@ module Statsample
451
514
  }
452
515
  end
453
516
  def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
454
- split_by_separator(sep).inject({}) {|a,v|
455
- a[v[0]]=v[1].inject {|s,x| s+x.to_i}
456
- a
457
- }
517
+ split_by_separator(sep).inject({}) {|a,v|
518
+ a[v[0]]=v[1].inject {|s,x| s+x.to_i}
519
+ a
520
+ }
458
521
  end
459
522
 
460
523
  # Returns a random sample of size n, with replacement,
@@ -463,13 +526,8 @@ module Statsample
463
526
  # In all the trials, every item has the same probability
464
527
  # of being selected.
465
528
  def sample_with_replacement(sample=1)
466
- if(@type!=:scale or !Statsample.has_gsl?)
467
- vds=@valid_data.size
468
- (0...sample).collect{ @valid_data[rand(vds)] }
469
- else
470
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
471
- r.sample(@gsl, sample).to_a
472
- end
529
+ vds=@valid_data.size
530
+ (0...sample).collect{ @valid_data[rand(vds)] }
473
531
  end
474
532
  # Returns a random sample of size n, without replacement,
475
533
  # only with valid data.
@@ -479,7 +537,6 @@ module Statsample
479
537
  # A sample of the same size of the vector is the vector itself.
480
538
 
481
539
  def sample_without_replacement(sample=1)
482
- if(@type!=:scale or !Statsample.has_gsl?)
483
540
  raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
484
541
  out=[]
485
542
  size=@valid_data.size
@@ -487,11 +544,7 @@ module Statsample
487
544
  value=rand(size)
488
545
  out.push(value) if !out.include?value
489
546
  end
490
- out.collect{|i|@data[i]}
491
- else
492
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
493
- r.choose(@gsl, sample).to_a
494
- end
547
+ out.collect{|i| @data[i]}
495
548
  end
496
549
  # Retrieves number of cases which comply condition.
497
550
  # If block given, retrieves number of instances where
@@ -535,11 +588,11 @@ module Statsample
535
588
  end
536
589
  # Return true if all data is Numeric or nil
537
590
  def can_be_scale?
538
- if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
539
- false
540
- else
541
- true
542
- end
591
+ if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
592
+ false
593
+ else
594
+ true
595
+ end
543
596
  end
544
597
 
545
598
  def to_s
@@ -560,13 +613,13 @@ module Statsample
560
613
  end
561
614
  # Retrieves uniques values for data.
562
615
  def factors
563
- if @type==:scale
564
- @scale_data.uniq.sort
565
- elsif @type==:date
566
- @date_data_with_nils.uniq.sort
567
- else
568
- @valid_data.uniq.sort
569
- end
616
+ if @type==:scale
617
+ @scale_data.uniq.sort
618
+ elsif @type==:date
619
+ @date_data_with_nils.uniq.sort
620
+ else
621
+ @valid_data.uniq.sort
622
+ end
570
623
  end
571
624
  if Statsample::STATSAMPLE__.respond_to?(:frequencies)
572
625
  # Returns a hash with the distribution of frequencies for
@@ -579,6 +632,8 @@ module Statsample
579
632
  _frequencies
580
633
  end
581
634
  end
635
+
636
+
582
637
  def _frequencies #:nodoc:
583
638
  @valid_data.inject(Hash.new) {|a,x|
584
639
  a[x]||=0
@@ -589,7 +644,7 @@ module Statsample
589
644
 
590
645
  # Returns the most frequent item.
591
646
  def mode
592
- frequencies.max{|a,b| a[1]<=>b[1]}[0]
647
+ frequencies.max{|a,b| a[1]<=>b[1]}.first
593
648
  end
594
649
  # The numbers of item with valid data.
595
650
  def n_valid
@@ -678,22 +733,17 @@ module Statsample
678
733
  # Return the median (percentil 50)
679
734
  def median
680
735
  check_type :ordinal
681
- if Statsample.has_gsl? and @type==:scale
682
- sorted=GSL::Vector.alloc(@scale_data.sort)
683
- GSL::Stats::median_from_sorted_data(sorted)
684
- else
685
- percentil(50)
686
- end
736
+ percentil(50)
687
737
  end
688
738
  # Minimum value
689
739
  def min
690
740
  check_type :ordinal
691
- @valid_data.min;
741
+ @valid_data.min
692
742
  end
693
743
  # Maximum value
694
744
  def max
695
745
  check_type :ordinal
696
- @valid_data.max;
746
+ @valid_data.max
697
747
  end
698
748
 
699
749
  def set_date_data
@@ -722,9 +772,6 @@ module Statsample
722
772
  x.to_f
723
773
  end
724
774
  end
725
- if Statsample.has_gsl?
726
- @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
727
- end
728
775
  end
729
776
 
730
777
  private :set_date_data, :set_scale_data
@@ -791,7 +838,6 @@ module Statsample
791
838
  # Sample Standard deviation (denominator n-1)
792
839
  def standard_deviation_sample(m=nil)
793
840
  check_type :scale
794
-
795
841
  m||=mean
796
842
  Math::sqrt(variance_sample(m))
797
843
  end
@@ -816,76 +862,30 @@ module Statsample
816
862
  check_type :scale
817
863
  @scale_data.inject(1){|a,x| a*x }
818
864
  end
819
- if Statsample.has_gsl?
820
- %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
821
- m_nuevo=(m+"_slow").intern
822
- alias_method m_nuevo, m.intern
823
- }
824
- def sum # :nodoc:
825
- check_type :scale
826
-
827
- @gsl.sum
828
- end
829
- def mean # :nodoc:
865
+
866
+ # With a fixnum, creates X bins within the range of data
867
+ # With an Array, each value will be a cut point
868
+ def histogram(bins=10)
830
869
  check_type :scale
831
- @gsl.nil? ? nil : @gsl.mean
832
- end
833
- def variance_sample(m=nil) # :nodoc:
834
- check_type :scale
835
- m||=mean
836
- @gsl.variance_m
837
- end
838
- def standard_deviation_sample(m=nil) # :nodoc:
839
- check_type :scale
840
- return nil if @gsl.nil?
841
- m||=mean
842
- @gsl.sd(m)
843
- end
844
870
 
845
- def variance_population(m=nil) # :nodoc:
846
- check_type :scale
847
- m||=mean
848
- @gsl.variance_with_fixed_mean(m)
849
- end
850
- def standard_deviation_population(m=nil) # :nodoc:
851
- check_type :scale
852
- m||=mean
853
- @gsl.sd_with_fixed_mean(m)
854
- end
855
- def skew # :nodoc:
856
- check_type :scale
857
- @gsl.skew
858
- end
859
- def kurtosis # :nodoc:
860
- check_type :scale
861
- @gsl.kurtosis
862
- end
863
- # Create a GSL::Histogram
864
- # With a fixnum, creates X bins within the range of data
865
- # With an Array, each value will be a cut point
866
- def histogram(bins=10)
867
- check_type :scale
868
-
869
- if bins.is_a? Array
870
- #h=Statsample::Histogram.new(self, bins)
871
- h=Statsample::Histogram.alloc(bins)
872
- else
873
- # ugly patch. The upper limit for a bin has the form
874
- # x < range
875
- #h=Statsample::Histogram.new(self, bins)
876
- min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
877
- # fix last data
878
- if max==@valid_data.max
879
- max+=1e-10
880
- end
881
- h=Statsample::Histogram.alloc(bins,[min,max])
882
- # Fix last bin
883
-
871
+ if bins.is_a? Array
872
+ #h=Statsample::Histogram.new(self, bins)
873
+ h=Statsample::Histogram.alloc(bins)
874
+ else
875
+ # ugly patch. The upper limit for a bin has the form
876
+ # x < range
877
+ #h=Statsample::Histogram.new(self, bins)
878
+ min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
879
+ # fix last data
880
+ if max==@valid_data.max
881
+ max+=1e-10
884
882
  end
885
- h.increment(@valid_data)
886
- h
883
+ h=Statsample::Histogram.alloc(bins,[min,max])
884
+ # Fix last bin
885
+
887
886
  end
888
-
887
+ h.increment(@valid_data)
888
+ h
889
889
  end
890
890
 
891
891
  # Coefficient of variation
@@ -894,7 +894,6 @@ module Statsample
894
894
  check_type :scale
895
895
  standard_deviation_sample.quo(mean)
896
896
  end
897
-
898
897
  alias_method :sdp, :standard_deviation_population
899
898
  alias_method :sds, :standard_deviation_sample
900
899
  alias_method :adp, :average_deviation_population
@@ -902,5 +901,6 @@ module Statsample
902
901
  alias_method :variance, :variance_sample
903
902
  alias_method :sd, :standard_deviation_sample
904
903
  alias_method :ss, :sum_of_squares
904
+ include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
905
905
  end
906
906
  end