statsample 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
@@ -8,13 +8,13 @@ module Statsample
8
8
 
9
9
  # Name of F analysis
10
10
  attr_accessor :name
11
- attr_reader :w
12
- attr_reader :nr
13
- attr_writer :tails
11
+ attr_reader :w
12
+ attr_reader :nr
13
+ attr_writer :tails
14
14
  # Parameters:
15
15
  def initialize(v1,v2, opts=Hash.new)
16
- @v1=v1
17
- @v2=v2
16
+ @v1 = v1
17
+ @v2 = v2
18
18
  opts_default={:name=>_("Wilcoxon Signed Rank Test"),:tails=>:both}
19
19
  @opts=opts_default.merge(opts)
20
20
  opts_default.keys.each {|k|
@@ -22,66 +22,68 @@ module Statsample
22
22
  }
23
23
  calculate
24
24
  end
25
+
25
26
  def calculate
26
- df=Statsample::Dataset.new({'v1'=>@v1,'v2'=>@v2})
27
- df["abs"]=df.collect {|row|
28
- r=(row["v2"]-row["v1"]).abs
29
- }
30
- df["sgn"]=df.collect {|row|
31
- r=row["v2"]-row["v1"]
32
- r==0 ? 0 : r/r.abs
33
- }
34
- df=df.filter {|row| row["sgn"]!=0}
35
- df["rank"]=df["abs"].ranked
36
- @nr=df.cases
37
- @w=df.collect {|row|
38
- row["sgn"]*row["rank"]
39
- #p row["sgn"]*row["rank"]
40
- }.sum
27
+ df = Daru::DataFrame.new({:v1 => @v1,:v2 => @v2})
28
+ # df[:abs]=df.collect(:row) { |row| (row[:v2] - row[:v1]).abs }
29
+ df[:abs] = (df[:v2] - df[:v1]).abs
30
+ df[:sgn] = df.collect(:row) { |row|
31
+ r = row[:v2] - row[:v1]
32
+ r == 0 ? 0 : r/r.abs
33
+ }
34
+ df = df.filter_rows { |row| row[:sgn] != 0}
35
+ df[:rank] = df[:abs].ranked
36
+ @nr = df.nrows
37
+
38
+ @w = df.collect(:row) { |row|
39
+ row[:sgn] * row[:rank]
40
+ }.sum
41
41
  end
42
+
42
43
  def report_building(generator) # :nodoc:
43
44
  generator.section(:name=>@name) do |s|
44
45
  s.table(:name=>_("%s results") % @name) do |t|
45
46
  t.row([_("W Value"), "%0.3f" % @w])
46
47
  t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]])
47
48
  if(nr<=10)
48
- t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]])
49
+ t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]])
49
50
  end
50
51
  end
51
52
  end
52
53
  end
53
54
  def z
54
- sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6)
55
- (w-0.5)/sigma
55
+ sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6)
56
+ (w-0.5)/sigma
56
57
  end
57
58
  # Assuming normal distribution of W, this calculates
58
59
  # the probability of samples with Z equal or higher than
59
60
  # obtained on sample
60
61
  def probability_z
61
- (1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1)
62
+ (1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1)
62
63
  end
63
64
  # Calculate exact probability.
64
65
  # Don't calculate for large Nr, please!
65
66
  def probability_exact
66
- str_format="%0#{nr}b"
67
- combinations=2**nr
68
- #p str_format
69
- total_w=combinations.times.map {|i|
70
- comb=sprintf(str_format,i)
71
- w_local=comb.length.times.inject(0) {|ac,j|
72
- sgn=comb[j]=="0" ? -1 : 1
73
- ac+(j+1)*sgn
74
- }
75
- }.sort
76
- total_w.find_all {|v|
77
- if @tails==:both
78
- v<=-w.abs or v>=w.abs
79
- elsif @tails==:left
80
- v<=w
81
- elsif @tails==:right
82
- v>=w
83
- end
84
- }.count/(combinations.to_f)
67
+ str_format="%0#{nr}b"
68
+ combinations=2**nr
69
+ #p str_format
70
+ total_w=combinations.times.map do |i|
71
+ comb=sprintf(str_format,i)
72
+ w_local=comb.length.times.inject(0) do |ac,j|
73
+ sgn=comb[j]=="0" ? -1 : 1
74
+ ac+(j+1)*sgn
75
+ end
76
+ end.sort
77
+
78
+ total_w.find_all do |v|
79
+ if @tails==:both
80
+ v<=-w.abs or v>=w.abs
81
+ elsif @tails==:left
82
+ v<=w
83
+ elsif @tails==:right
84
+ v>=w
85
+ end
86
+ end.count/(combinations.to_f)
85
87
  end
86
88
  end
87
89
  end
@@ -1,22 +1,18 @@
1
- require 'date'
2
- require 'statsample/vector/gsl'
3
-
4
1
  module Statsample::VectorShorthands
5
2
  # Creates a new Statsample::Vector object
6
3
  # Argument should be equal to Vector.new
7
4
  def to_vector(*args)
8
- Statsample::Vector.new(self,*args)
5
+ Statsample::Vector.new(self)
9
6
  end
10
7
 
11
- # Creates a new Statsample::Vector object of type :scale.
8
+ # Creates a new Daru::Vector object of type :scale.
12
9
  # Deprecated. Use to_numeric instead.
13
10
  def to_scale(*args)
14
- $stderr.puts "WARNING: to_scale has been deprecated. Use to_numeric instead."
15
- Statsample::Vector.new(self, :numeric, *args)
11
+ Statsample::Vector.new(self, *args)
16
12
  end
17
13
 
18
14
  def to_numeric(*args)
19
- Statsample::Vector.new(self, :numeric, *args)
15
+ Statsample::Vector.new(self)
20
16
  end
21
17
  end
22
18
 
@@ -39,1057 +35,118 @@ module Statsample
39
35
  # == Usage
40
36
  # The fast way to create a vector uses Array.to_vector or Array.to_numeric.
41
37
  #
42
- # v=[1,2,3,4].to_vector(:numeric)
43
- # v=[1,2,3,4].to_numeric
44
- #
45
- class Vector
46
- include Enumerable
47
- include Writable
48
- include Summarizable
38
+ # == Deprecation Warning
39
+ #
40
+ # Statsample::Vector has been deprecated in favour of Daru::Vector. Daru is
41
+ # a dedicated data analysis and manipulation library that brings awesome
42
+ # data analysis functionality to ruby. Check out the daru docs at
43
+ # https://github.com/v0dro/daru#notebooks
44
+ class Vector < Daru::Vector
49
45
  include Statsample::VectorShorthands
50
46
 
51
- # Level of measurement. Could be :object, :numeric
52
- attr_reader :type
53
- # Original data.
54
- attr_reader :data
55
- # Valid data. Equal to data, minus values assigned as missing values
56
- attr_reader :valid_data
57
- # Array of values considered as missing. Nil is a missing value, by default
58
- attr_reader :missing_values
59
- # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
60
- attr_reader :today_values
47
+ # Valid data. Equal to data, minus values assigned as missing values.
48
+ #
49
+ # == Deprecation Warning
50
+ #
51
+ # Use Daru::Vector#only_valid instead of this method.
52
+ def valid_data
53
+ $stderr.puts "WARNING: valid_data in Statsample::Vector has been deprecated in favor of only_valid in Daru::Vector. Please use that.\n"
54
+ only_valid.to_a
55
+ end
61
56
  # Missing values array
62
- attr_reader :missing_data
63
- # Original data, with all missing values replaced by nils
64
- attr_reader :data_with_nils
65
- # Date date, with all missing values replaced by nils
66
- attr_reader :date_data_with_nils
67
- # Change label for specific values
68
- attr_accessor :labels
69
- # Name of vector. Should be used for output by many classes
70
- attr_accessor :name
57
+ #
58
+ # == Deprecation Warning
59
+ #
60
+ # Use Daru::Vector#only_missing instead of this method.
61
+ def missing_data
62
+ only_missing.to_a
63
+ end
64
+ # Original data.
65
+ #
66
+ # == Deprecation Warning
67
+ #
68
+ # Use Daru::Vector#to_a instead of this method.
69
+ def data_with_nils
70
+ to_a
71
+ end
72
+
73
+ def type= val
74
+ raise NoMethodError, "Daru::Vector automatically figures the type of data. There is no need to assign it anymore."
75
+ end
76
+
77
+ def initialize(data=[], type=:object, opts=Hash.new)
78
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that."
71
79
 
72
- # Creates a new Vector object.
73
- # * <tt>data</tt> Any data which can be converted on Array
74
- # * <tt>type</tt> Level of meausurement. See Vector#type
75
- # * <tt>opts</tt> Hash of options
76
- # * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
77
- # * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
78
- # * <tt>:labels</tt> Labels for data values
79
- # * <tt>:name</tt> Name of vector
80
- def initialize(data=[], type=:object, opts=Hash.new)
81
80
  if type == :ordinal or type == :scale
82
- $stderr.puts "WARNING: #{type} has been deprecated. Use :numeric instead."
83
- type = :numeric
81
+ $stderr.puts "WARNING: #{type} has been deprecated."
84
82
  end
85
83
 
86
84
  if type == :nominal
87
- $stderr.puts "WARNING: nominal has been deprecated. Use :object instead."
88
- type = :object
85
+ $stderr.puts "WARNING: nominal has been deprecated."
89
86
  end
90
87
 
91
- @data=data.is_a?(Array) ? data : data.to_a
92
- @type=type
93
- opts_default={
94
- :missing_values=>[],
95
- :today_values=>['NOW','TODAY', :NOW, :TODAY],
96
- :labels=>{},
97
- :name=>nil
98
- }
99
- @opts=opts_default.merge(opts)
100
- if @opts[:name].nil?
88
+ if opts[:today_values]
89
+ raise ArgumentError, "This option is no longer supported in Vector. Watch out for the next version of Daru::Vector that will have full time series support"
90
+ end
91
+
92
+ if opts[:name].nil?
101
93
  @@n_table||=0
102
94
  @@n_table+=1
103
- @opts[:name]="Vector #{@@n_table}"
95
+ opts[:name] = "Vector #{@@n_table}"
104
96
  end
105
- @missing_values=@opts[:missing_values]
106
- @labels=@opts[:labels]
107
- @today_values=@opts[:today_values]
108
- @name=@opts[:name]
109
- @valid_data=[]
110
- @data_with_nils=[]
111
- @date_data_with_nils=[]
112
- @missing_data=[]
113
- @has_missing_data=nil
114
- @numeric_data=nil
115
- set_valid_data
116
- self.type=type
97
+
98
+ super(data, opts)
117
99
  end
100
+
118
101
  # Create a vector using (almost) any object
119
102
  # * Array: flattened
120
103
  # * Range: transformed using to_a
121
104
  # * Statsample::Vector
122
105
  # * Numeric and string values
106
+ #
107
+ # == Deprecation Warning
108
+ #
109
+ # Statsample::Vector is to be replaced by Daru::Vector soon. Use the
110
+ # equivalent method Daru::Vector.[] for this purpose.
123
111
  def self.[](*args)
124
- values=[]
125
- args.each do |a|
126
- case a
127
- when Array
128
- values.concat a.flatten
129
- when Statsample::Vector
130
- values.concat a.to_a
131
- when Range
132
- values.concat a.to_a
133
- else
134
- values << a
135
- end
136
- end
137
- vector=new(values)
138
- vector.type=:numeric if vector.can_be_numeric?
139
- vector
112
+ $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that."
113
+ super *args
140
114
  end
115
+
141
116
  # Create a new numeric type vector
142
117
  # Parameters
143
118
  # [n] Size
144
119
  # [val] Value of each value
145
120
  # [&block] If block provided, is used to set the values of vector
121
+ #
122
+ # == Deprecation Warning
123
+ #
124
+ # Statsample::Vector is to be replaced by Daru::Vector soon. Use the
125
+ # equivalent method Daru::Vector.new_with_size for this purpose.
146
126
  def self.new_numeric(n,val=nil, &block)
147
127
  if block
148
- vector=n.times.map {|i| block.call(i)}.to_numeric
128
+ Statsample::Vector.new(n.times.map {|i| block.call(i)})
149
129
  else
150
- vector=n.times.map { val}.to_numeric
130
+ Statsample::Vector.new(n.times.map { val })
151
131
  end
152
- vector.type=:numeric
153
- vector
154
132
  end
155
133
 
156
134
  # Deprecated. Use new_numeric instead.
157
135
  def self.new_scale(n, val=nil,&block)
158
- $stderr.puts "WARNING: .new_scale has been deprecated. Use .new_numeric instead."
159
136
  new_numeric n, val, &block
160
137
  end
161
- # Creates a duplicate of the Vector.
162
- # Note: data, missing_values and labels are duplicated, so
163
- # changes on original vector doesn't propages to copies.
164
- def dup
165
- Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name)
166
- end
167
- # Returns an empty duplicate of the vector. Maintains the type,
168
- # missing values and labels.
169
- def dup_empty
170
- Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
171
- end
172
-
173
- if Statsample::STATSAMPLE__.respond_to?(:check_type)
174
- # Raises an exception if type of vector is inferior to t type
175
- def check_type(t)
176
- Statsample::STATSAMPLE__.check_type(self,t)
177
- end
178
- else
179
- def check_type(t) #:nodoc:
180
- _check_type(t)
181
- end
182
- end
183
-
184
-
185
- def _check_type(t) #:nodoc:
186
- raise NoMethodError if (t == :numeric and @type == :object) or
187
- (t == :date) or (:date == @type)
188
- end
189
-
190
- def vector_standarized_compute(m,sd) # :nodoc:
191
- @data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:numeric)
192
- end
193
- # Return a vector usign the standarized values for data
194
- # with sd with denominator n-1. With variance=0 or mean nil,
195
- # returns a vector of equal size full of nils
196
- #
197
- def vector_standarized(use_population=false)
198
- check_type :numeric
199
- m=mean
200
- sd=use_population ? sdp : sds
201
- return ([nil]*size).to_numeric if mean.nil? or sd==0.0
202
- vector=vector_standarized_compute(m,sd)
203
- vector.name=_("%s(standarized)") % @name
204
- vector
205
- end
206
- def vector_centered_compute(m) #:nodoc:
207
- @data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_numeric
208
- end
209
- # Return a centered vector
210
- def vector_centered
211
- check_type :numeric
212
- m=mean
213
- return ([nil]*size).to_numeric if mean.nil?
214
- vector=vector_centered_compute(m)
215
- vector.name=_("%s(centered)") % @name
216
- vector
217
- end
218
-
219
- alias_method :standarized, :vector_standarized
220
- alias_method :centered, :vector_centered
221
- # Return a vector with values replaced with the percentiles
222
- # of each values
223
- def vector_percentil
224
- check_type :numeric
225
- c=@valid_data.size
226
- vector=ranked.map {|i| i.nil? ? nil : (i.quo(c)*100).to_f }.to_vector(@type)
227
- vector.name=_("%s(percentil)") % @name
228
- vector
229
- end
230
- def box_cox_transformation(lambda) # :nodoc:
231
- raise "Should be a numeric" unless @type==:numeric
232
- @data_with_nils.collect{|x|
233
- if !x.nil?
234
- if(lambda==0)
235
- Math.log(x)
236
- else
237
- (x**lambda-1).quo(lambda)
238
- end
239
- else
240
- nil
241
- end
242
- }.to_vector(:numeric)
243
- end
244
-
245
- # Vector equality.
246
- # Two vector will be the same if their data, missing values, type, labels are equals
247
- def ==(v2)
248
- return false unless v2.instance_of? Statsample::Vector
249
- @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
250
- end
251
-
252
- def _dump(i) # :nodoc:
253
- Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type,'name'=>@name})
254
- end
255
-
256
- def self._load(data) # :nodoc:
257
- h=Marshal.load(data)
258
- Vector.new(h['data'], h['type'], :missing_values=> h['missing_values'], :labels=>h['labels'], :name=>h['name'])
259
- end
260
- # Returns a new vector, with data modified by block.
261
- # Equivalent to create a Vector after #collect on data
262
- def recode(type=nil)
263
- type||=@type
264
- @data.collect{|x|
265
- yield x
266
- }.to_vector(type)
267
- end
268
- # Modifies current vector, with data modified by block.
269
- # Equivalent to #collect! on @data
270
- def recode!
271
- @data.collect!{|x|
272
- yield x
273
- }
274
- set_valid_data
275
- end
276
- def push(v)
277
- @data.push(v)
278
- set_valid_data
279
- end
280
-
281
- # Dicotomize the vector with 0 and 1, based on lowest value
282
- # If parameter if defined, this value and lower
283
- # will be 0 and higher, 1
284
- def dichotomize(low = nil)
285
- low ||= factors.min
286
-
287
- @data_with_nils.collect do |x|
288
- if x.nil?
289
- nil
290
- elsif x > low
291
- 1
292
- else
293
- 0
294
- end
295
- end.to_numeric
296
- end
297
- # Iterate on each item.
298
- # Equivalent to
299
- # @data.each{|x| yield x}
300
- def each
301
- @data.each{|x| yield(x) }
302
- end
303
-
304
- # Iterate on each item, retrieving index
305
- def each_index
306
- (0...@data.size).each {|i|
307
- yield(i)
308
- }
309
- end
310
- # Add a value at the end of the vector.
311
- # If second argument set to false, you should update the Vector usign
312
- # Vector.set_valid_data at the end of your insertion cycle
313
- #
314
- def add(v,update_valid=true)
315
- @data.push(v)
316
- set_valid_data if update_valid
317
- end
318
- # Update valid_data, missing_data, data_with_nils and gsl
319
- # at the end of an insertion.
320
- #
321
- # Use after Vector.add(v,false)
322
- # Usage:
323
- # v=Statsample::Vector.new
324
- # v.add(2,false)
325
- # v.add(4,false)
326
- # v.data
327
- # => [2,3]
328
- # v.valid_data
329
- # => []
330
- # v.set_valid_data
331
- # v.valid_data
332
- # => [2,3]
333
- def set_valid_data
334
- @valid_data.clear
335
- @missing_data.clear
336
- @data_with_nils.clear
337
- @date_data_with_nils.clear
338
- set_valid_data_intern
339
- set_numeric_data if(@type==:numeric)
340
- set_date_data if(@type==:date)
341
- end
342
- if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
343
- def set_valid_data_intern #:nodoc:
344
- Statsample::STATSAMPLE__.set_valid_data_intern(self)
345
- end
346
- else
347
- def set_valid_data_intern #:nodoc:
348
- _set_valid_data_intern
349
- end
350
- end
351
- def _set_valid_data_intern #:nodoc:
352
- @data.each do |n|
353
- if is_valid? n
354
- @valid_data.push(n)
355
- @data_with_nils.push(n)
356
- else
357
- @data_with_nils.push(nil)
358
- @missing_data.push(n)
359
- end
360
- end
361
- @has_missing_data=@missing_data.size>0
362
- end
363
-
364
- # Retrieves true if data has one o more missing values
365
- def has_missing_data?
366
- @has_missing_data
367
- end
368
- alias :flawed? :has_missing_data?
369
-
370
- # Retrieves label for value x. Retrieves x if
371
- # no label defined.
372
- def labeling(x)
373
- @labels.has_key?(x) ? @labels[x].to_s : x.to_s
374
- end
375
- alias :label :labeling
376
- # Returns a Vector with data with labels replaced by the label.
377
- def vector_labeled
378
- d=@data.collect{|x|
379
- if @labels.has_key? x
380
- @labels[x]
381
- else
382
- x
383
- end
384
- }
385
- Vector.new(d,@type)
386
- end
387
- # Size of total data
388
- def size
389
- @data.size
390
- end
391
- alias_method :n, :size
392
-
393
- # Retrieves i element of data
394
- def [](i)
395
- @data[i]
396
- end
397
- # Set i element of data.
398
- # Note: Use set_valid_data if you include missing values
399
- def []=(i,v)
400
- @data[i]=v
401
- end
402
- # Return true if a value is valid (not nil and not included on missing values)
403
- def is_valid?(x)
404
- !(x.nil? or @missing_values.include? x)
405
- end
406
- # Set missing_values.
407
- # set_valid_data is called after changes
408
- def missing_values=(vals)
409
- @missing_values = vals
410
- set_valid_data
411
- end
412
- # Set data considered as "today" on data vectors
413
- def today_values=(vals)
414
- @today_values = vals
415
- set_valid_data
416
- end
417
- # Set level of measurement.
418
- def type=(t)
419
- @type=t
420
- set_numeric_data if(t==:numeric)
421
- set_date_data if (t==:date)
422
- end
423
- def to_a
424
- if @data.is_a? Array
425
- @data.dup
426
- else
427
- @data.to_a
428
- end
429
- end
430
- alias_method :to_ary, :to_a
431
-
432
- # Vector sum.
433
- # - If v is a scalar, add this value to all elements
434
- # - If v is a Array or a Vector, should be of the same size of this vector
435
- # every item of this vector will be added to the value of the
436
- # item at the same position on the other vector
437
- def +(v)
438
- _vector_ari("+",v)
439
- end
440
- # Vector rest.
441
- # - If v is a scalar, rest this value to all elements
442
- # - If v is a Array or a Vector, should be of the same
443
- # size of this vector
444
- # every item of this vector will be rested to the value of the
445
- # item at the same position on the other vector
446
-
447
- def -(v)
448
- _vector_ari("-",v)
449
- end
450
-
451
- def *(v)
452
- _vector_ari("*",v)
453
- end
454
- # Reports all values that doesn't comply with a condition.
455
- # Returns a hash with the index of data and the invalid data.
456
- def verify
457
- h={}
458
- (0...@data.size).to_a.each{|i|
459
- if !(yield @data[i])
460
- h[i]=@data[i]
461
- end
462
- }
463
- h
464
- end
465
- def _vector_ari(method,v) # :nodoc:
466
- if(v.is_a? Vector or v.is_a? Array)
467
- raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
468
- sum=[]
469
- v.size.times {|i|
470
- if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
471
- sum.push(@data[i].send(method,v[i]))
472
- else
473
- sum.push(nil)
474
- end
475
- }
476
- Statsample::Vector.new(sum, :numeric)
477
- elsif(v.respond_to? method )
478
- Statsample::Vector.new(
479
- @data.collect {|x|
480
- if(!x.nil?)
481
- x.send(method,v)
482
- else
483
- nil
484
- end
485
- } , :numeric)
486
- else
487
- raise TypeError,"You should pass a scalar or a array/vector"
488
- end
489
-
490
- end
491
- # Return an array with the data splitted by a separator.
492
- # a=Vector.new(["a,b","c,d","a,b","d"])
493
- # a.splitted
494
- # =>
495
- # [["a","b"],["c","d"],["a","b"],["d"]]
496
- def splitted(sep=Statsample::SPLIT_TOKEN)
497
- @data.collect{|x|
498
- if x.nil?
499
- nil
500
- elsif (x.respond_to? :split)
501
- x.split(sep)
502
- else
503
- [x]
504
- end
505
- }
506
- end
507
- # Returns a hash of Vectors, defined by the different values
508
- # defined on the fields
509
- # Example:
510
- #
511
- # a=Vector.new(["a,b","c,d","a,b"])
512
- # a.split_by_separator
513
- # => {"a"=>#<Statsample::Type::object:0x7f2dbcc09d88
514
- # @data=[1, 0, 1]>,
515
- # "b"=>#<Statsample::Type::object:0x7f2dbcc09c48
516
- # @data=[1, 1, 0]>,
517
- # "c"=>#<Statsample::Type::object:0x7f2dbcc09b08
518
- # @data=[0, 1, 1]>}
519
- #
520
- def split_by_separator(sep=Statsample::SPLIT_TOKEN)
521
- split_data=splitted(sep)
522
- factors=split_data.flatten.uniq.compact
523
- out=factors.inject({}) {|a,x|
524
- a[x]=[]
525
- a
526
- }
527
- split_data.each do |r|
528
- if r.nil?
529
- factors.each do |f|
530
- out[f].push(nil)
531
- end
532
- else
533
- factors.each do |f|
534
- out[f].push(r.include?(f) ? 1:0)
535
- end
536
- end
537
- end
538
- out.inject({}){|s,v|
539
- s[v[0]]=Vector.new(v[1],:object)
540
- s
541
- }
542
- end
543
- def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
544
- split_by_separator(sep).inject({}) {|a,v|
545
- a[v[0]]=v[1].inject {|s,x| s+x.to_i}
546
- a
547
- }
548
- end
549
-
550
- # == Bootstrap
551
- # Generate +nr+ resamples (with replacement) of size +s+
552
- # from vector, computing each estimate from +estimators+
553
- # over each resample.
554
- # +estimators+ could be
555
- # a) Hash with variable names as keys and lambdas as values
556
- # a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
557
- # b) Array with names of method to bootstrap
558
- # a.bootstrap([:mean, :sd],1000)
559
- # c) A single method to bootstrap
560
- # a.jacknife(:mean, 1000)
561
- # If s is nil, is set to vector size by default.
562
- #
563
- # Returns a dataset where each vector is an vector
564
- # of length +nr+ containing the computed resample estimates.
565
- def bootstrap(estimators, nr, s=nil)
566
- s||=n
567
-
568
- h_est, es, bss= prepare_bootstrap(estimators)
569
-
570
-
571
- nr.times do |i|
572
- bs=sample_with_replacement(s)
573
- es.each do |estimator|
574
- # Add bootstrap
575
- bss[estimator].push(h_est[estimator].call(bs))
576
- end
577
- end
578
-
579
- es.each do |est|
580
- bss[est]=bss[est].to_numeric
581
- bss[est].type=:numeric
582
- end
583
- bss.to_dataset
584
-
585
- end
586
-
587
- # == Jacknife
588
- # Returns a dataset with jacknife delete-+k+ +estimators+
589
- # +estimators+ could be:
590
- # a) Hash with variable names as keys and lambdas as values
591
- # a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
592
- # b) Array with method names to jacknife
593
- # a.jacknife([:mean, :sd])
594
- # c) A single method to jacknife
595
- # a.jacknife(:mean)
596
- # +k+ represent the block size for block jacknife. By default
597
- # is set to 1, for classic delete-one jacknife.
598
- #
599
- # Returns a dataset where each vector is an vector
600
- # of length +cases+/+k+ containing the computed jacknife estimates.
601
- #
602
- # == Reference:
603
- # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
604
- def jacknife(estimators, k=1)
605
- raise "n should be divisible by k:#{k}" unless n%k==0
606
-
607
- nb=(n / k).to_i
608
-
609
-
610
- h_est, es, ps= prepare_bootstrap(estimators)
611
-
612
- est_n=es.inject({}) {|h,v|
613
- h[v]=h_est[v].call(self)
614
- h
615
- }
616
-
617
138
 
618
- nb.times do |i|
619
- other=@data_with_nils.dup
620
- other.slice!(i*k,k)
621
- other=other.to_numeric
622
- es.each do |estimator|
623
- # Add pseudovalue
624
- ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other))
625
- end
626
- end
627
-
628
-
629
- es.each do |est|
630
- ps[est]=ps[est].to_numeric
631
- ps[est].type=:numeric
632
- end
633
- ps.to_dataset
634
- end
635
-
636
-
637
- # For an array or hash of estimators methods, returns
638
- # an array with three elements
639
- # 1.- A hash with estimators names as keys and lambdas as values
640
- # 2.- An array with estimators names
641
- # 3.- A Hash with estimators names as keys and empty arrays as values
642
- def prepare_bootstrap(estimators)
643
- h_est=estimators
644
-
645
- h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash
646
-
647
- if h_est.is_a? Array
648
- h_est=h_est.inject({}) {|h,est|
649
- h[est]=lambda {|v| v.send(est)}
650
- h
651
- }
652
- end
653
-
654
- bss=h_est.keys.inject({}) {|h,v| h[v]=[];h}
655
-
656
- [h_est,h_est.keys, bss]
657
-
658
- end
659
- private :prepare_bootstrap
660
-
661
- # Returns an random sample of size n, with replacement,
662
- # only with valid data.
663
- #
664
- # In all the trails, every item have the same probability
665
- # of been selected.
666
- def sample_with_replacement(sample=1)
667
- vds=@valid_data.size
668
- (0...sample).collect{ @valid_data[rand(vds)] }
669
- end
670
- # Returns an random sample of size n, without replacement,
671
- # only with valid data.
672
- #
673
- # Every element could only be selected once.
674
- #
675
- # A sample of the same size of the vector is the vector itself.
676
-
677
- def sample_without_replacement(sample=1)
678
- raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
679
- out=[]
680
- size=@valid_data.size
681
- while out.size<sample
682
- value=rand(size)
683
- out.push(value) if !out.include?value
684
- end
685
- out.collect{|i| @data[i]}
686
- end
687
- # Retrieves the number of cases that satisfy a condition.
688
- # If block given, retrieves number of instances where
689
- # block returns true.
690
- # If other values given, retrieves the frequency for
691
- # this value.
692
- def count(x=false)
693
- if block_given?
694
- r=@data.inject(0) {|s, i|
695
- r=yield i
696
- s+(r ? 1 : 0)
697
- }
698
- r.nil? ? 0 : r
699
- else
700
- frequencies[x].nil? ? 0 : frequencies[x]
701
- end
702
- end
703
-
704
- # Returns the database type for the vector, according to its content
705
-
706
- def db_type(dbs='mysql')
707
- # first, detect any character not number
708
- if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
709
- return "DATE"
710
- elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
711
- return "VARCHAR (255)"
712
- elsif @data.find {|v| v.to_s=~/\./}
713
- return "DOUBLE"
714
- else
715
- return "INTEGER"
716
- end
717
- end
718
139
  # Return true if all data is Date, "today" values or nil
719
140
  def can_be_date?
720
- if @data.find {|v|
721
- !v.nil? and !v.is_a? Date and !v.is_a? Time and (v.is_a? String and !@today_values.include? v) and (v.is_a? String and !(v=~/\d{4,4}[-\/]\d{1,2}[-\/]\d{1,2}/))}
722
- false
723
- else
724
- true
725
- end
141
+ raise NoMethodError, "This method is no longer supported."
726
142
  end
727
143
  # Return true if all data is Numeric or nil
728
144
  def can_be_numeric?
729
- if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
730
- false
731
- else
732
- true
733
- end
145
+ type == :numeric
734
146
  end
735
147
 
736
148
  def to_s
737
149
  sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
738
150
  end
739
- # Ugly name. Really, create a Vector for standard 'matrix' package.
740
- # <tt>dir</tt> could be :horizontal or :vertical
741
- def to_matrix(dir=:horizontal)
742
- case dir
743
- when :horizontal
744
- Matrix[@data]
745
- when :vertical
746
- Matrix.columns([@data])
747
- end
748
- end
749
- def inspect
750
- self.to_s
751
- end
752
- # Retrieves the unique values of the data.
753
- def factors
754
- if @type==:numeric
755
- @numeric_data.uniq.sort
756
- elsif @type==:date
757
- @date_data_with_nils.uniq.sort
758
- else
759
- @valid_data.uniq.sort
760
- end
761
- end
762
- if Statsample::STATSAMPLE__.respond_to?(:frequencies)
763
- # Returns a hash with the distribution of frequencies for
764
- # the sample
765
- def frequencies
766
- Statsample::STATSAMPLE__.frequencies(@valid_data)
767
- end
768
- else
769
- def frequencies #:nodoc:
770
- _frequencies
771
- end
772
- end
773
-
774
-
775
- def _frequencies #:nodoc:
776
- @valid_data.inject(Hash.new) {|a,x|
777
- a[x]||=0
778
- a[x]=a[x]+1
779
- a
780
- }
781
- end
782
-
783
- # Returns the most frequent item.
784
- def mode
785
- frequencies.max{|a,b| a[1]<=>b[1]}.first
786
- end
787
- # The number of items with valid data.
788
- def n_valid
789
- @valid_data.size
790
- end
791
- # Returns a hash with the distribution of proportions of
792
- # the sample.
793
- def proportions
794
- frequencies.inject({}){|a,v|
795
- a[v[0]] = v[1].quo(n_valid)
796
- a
797
- }
798
- end
799
- # Proportion of a given value.
800
- def proportion(v=1)
801
- frequencies[v].quo(@valid_data.size)
802
- end
803
- def report_building(b)
804
- b.section(:name=>name) do |s|
805
- s.text _("n :%d") % n
806
- s.text _("n valid:%d") % n_valid
807
- if @type==:object
808
- s.text _("factors:%s") % factors.join(",")
809
- s.text _("mode: %s") % mode
810
-
811
- s.table(:name=>_("Distribution")) do |t|
812
- frequencies.sort.each do |k,v|
813
- key=labels.has_key?(k) ? labels[k]:k
814
- t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))]
815
- end
816
- end
817
- end
818
-
819
- s.text _("median: %s") % median.to_s if(@type==:numeric or @type==:numeric)
820
- if(@type==:numeric)
821
- s.text _("mean: %0.4f") % mean
822
- if sd
823
- s.text _("std.dev.: %0.4f") % sd
824
- s.text _("std.err.: %0.4f") % se
825
- s.text _("skew: %0.4f") % skew
826
- s.text _("kurtosis: %0.4f") % kurtosis
827
- end
828
- end
829
- end
830
- end
831
-
832
- # Variance of p, according to population size
833
- def variance_proportion(n_poblation, v=1)
834
- Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
835
- end
836
- # Variance of the total, according to population size
837
- def variance_total(n_poblation, v=1)
838
- Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
839
- end
840
- def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
841
- Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
842
- end
843
- def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
844
- Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
845
- end
846
-
847
- self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
848
- met_or=met.gsub("_slow","")
849
- if !self.method_defined?(met_or)
850
- alias_method met_or, met
851
- end
852
- end
853
-
854
- ######
855
- ### numeric Methods
856
- ######
857
-
858
- # == Percentil
859
- # Returns the value of the percentile q
860
- #
861
- # Accepts an optional second argument specifying the strategy to interpolate
862
- # when the requested percentile lies between two data points a and b
863
- # Valid strategies are:
864
- # * :midpoint (Default): (a + b) / 2
865
- # * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
866
- # This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
867
- #
868
- def percentil(q, strategy = :midpoint)
869
- check_type :numeric
870
- sorted=@valid_data.sort
871
-
872
- case strategy
873
- when :midpoint
874
- v = (n_valid * q).quo(100)
875
- if(v.to_i!=v)
876
- sorted[v.to_i]
877
- else
878
- (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
879
- end
880
- when :linear
881
- index = (q / 100.0) * (n_valid + 1)
882
-
883
- k = index.truncate
884
- d = index % 1
885
-
886
- if k == 0
887
- sorted[0]
888
- elsif k >= sorted.size
889
- sorted[-1]
890
- else
891
- sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
892
- end
893
- else
894
- raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
895
- end
896
- end
897
-
898
- # Returns a ranked vector.
899
- def ranked(type=:numeric)
900
- check_type :numeric
901
- i=0
902
- r=frequencies.sort.inject({}){|a,v|
903
- a[v[0]]=(i+1 + i+v[1]).quo(2)
904
- i+=v[1]
905
- a
906
- }
907
- @data.collect {|c| r[c] }.to_vector(type)
908
- end
909
- # Return the median (percentil 50)
910
- def median
911
- check_type :numeric
912
- percentil(50)
913
- end
914
- # Minimum value
915
- def min
916
- check_type :numeric
917
- @valid_data.min
918
- end
919
- # Maximum value
920
- def max
921
- check_type :numeric
922
- @valid_data.max
923
- end
924
-
925
- def set_date_data
926
- @date_data_with_nils=@data.collect do|x|
927
- if x.is_a? Date
928
- x
929
- elsif x.is_a? Time
930
- Date.new(x.year, x.month, x.day)
931
- elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
932
- Date.new($1.to_i,$2.to_i,$3.to_i)
933
- elsif @today_values.include? x
934
- Date.today()
935
- elsif @missing_values.include? x or x.nil?
936
- nil
937
- end
938
- end
939
- end
940
-
941
- def set_numeric_data
942
- @numeric_data=@valid_data.collect do|x|
943
- if x.is_a? Numeric
944
- x
945
- elsif x.is_a? String and x.to_i==x.to_f
946
- x.to_i
947
- else
948
- x.to_f
949
- end
950
- end
951
- end
952
-
953
- private :set_date_data, :set_numeric_data
954
-
955
- # The range of the data (max - min)
956
- def range;
957
- check_type :numeric
958
- @numeric_data.max - @numeric_data.min
959
- end
960
- # The sum of values for the data
961
- def sum
962
- check_type :numeric
963
- @numeric_data.inject(0){|a,x|x+a} ;
964
- end
965
- # The arithmetical mean of data
966
- def mean
967
- check_type :numeric
968
- sum.to_f.quo(n_valid)
969
- end
970
- # Sum of squares for the data around a value.
971
- # By default, this value is the mean
972
- # ss= sum{(xi-m)^2}
973
- #
974
- def sum_of_squares(m=nil)
975
- check_type :numeric
976
- m||=mean
977
- @numeric_data.inject(0){|a,x| a+(x-m).square}
978
- end
979
- # Sum of squared deviation
980
- def sum_of_squared_deviation
981
- check_type :numeric
982
- @numeric_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
983
- end
984
-
985
- # Population variance (denominator N)
986
- def variance_population(m=nil)
987
- check_type :numeric
988
- m||=mean
989
- squares=@numeric_data.inject(0){|a,x| x.square+a}
990
- squares.quo(n_valid) - m.square
991
- end
992
-
993
-
994
- # Population Standard deviation (denominator N)
995
- def standard_deviation_population(m=nil)
996
- check_type :numeric
997
- Math::sqrt( variance_population(m) )
998
- end
999
-
1000
- # Population average deviation (denominator N)
1001
- # author: Al Chou
1002
-
1003
- def average_deviation_population( m = nil )
1004
- check_type :numeric
1005
- m ||= mean
1006
- ( @numeric_data.inject( 0 ) { |a, x| ( x - m ).abs + a } ).quo( n_valid )
1007
- end
1008
- def median_absolute_deviation
1009
- med=median
1010
- recode {|x| (x-med).abs}.median
1011
- end
1012
- alias :mad :median_absolute_deviation
1013
- # Sample Variance (denominator n-1)
1014
- def variance_sample(m=nil)
1015
- check_type :numeric
1016
- m||=mean
1017
- sum_of_squares(m).quo(n_valid - 1)
1018
- end
1019
-
1020
- # Sample Standard deviation (denominator n-1)
1021
- def standard_deviation_sample(m=nil)
1022
- check_type :numeric
1023
- m||=mean
1024
- Math::sqrt(variance_sample(m))
1025
- end
1026
- # Skewness of the sample
1027
- def skew(m=nil)
1028
- check_type :numeric
1029
- m||=mean
1030
- th=@numeric_data.inject(0){|a,x| a+((x-m)**3)}
1031
- th.quo((@numeric_data.size)*sd(m)**3)
1032
- end
1033
- # Kurtosis of the sample
1034
- def kurtosis(m=nil)
1035
- check_type :numeric
1036
- m||=mean
1037
- fo=@numeric_data.inject(0){|a,x| a+((x-m)**4)}
1038
- fo.quo((@numeric_data.size)*sd(m)**4)-3
1039
-
1040
- end
1041
- # Product of all values on the sample
1042
- #
1043
- def product
1044
- check_type :numeric
1045
- @numeric_data.inject(1){|a,x| a*x }
1046
- end
1047
-
1048
- # With a fixnum, creates X bins within the range of data
1049
- # With an Array, each value will be a cut point
1050
- def histogram(bins=10)
1051
- check_type :numeric
1052
-
1053
- if bins.is_a? Array
1054
- #h=Statsample::Histogram.new(self, bins)
1055
- h=Statsample::Histogram.alloc(bins)
1056
- else
1057
- # ugly patch. The upper limit for a bin has the form
1058
- # x < range
1059
- #h=Statsample::Histogram.new(self, bins)
1060
- min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
1061
- # fix last data
1062
- if max==@valid_data.max
1063
- max+=1e-10
1064
- end
1065
- h=Statsample::Histogram.alloc(bins,[min,max])
1066
- # Fix last bin
1067
-
1068
- end
1069
- h.increment(@valid_data)
1070
- h
1071
- end
1072
-
1073
- # Coefficient of variation
1074
- # Calculated with the sample standard deviation
1075
- def coefficient_of_variation
1076
- check_type :numeric
1077
- standard_deviation_sample.quo(mean)
1078
- end
1079
- # Standard error of the distribution mean
1080
- # Calculated using sd/sqrt(n)
1081
- def standard_error
1082
- standard_deviation_sample.quo(Math.sqrt(valid_data.size))
1083
- end
1084
- alias :se :standard_error
1085
-
1086
- alias_method :sdp, :standard_deviation_population
1087
- alias_method :sds, :standard_deviation_sample
1088
- alias_method :adp, :average_deviation_population
1089
- alias_method :cov, :coefficient_of_variation
1090
- alias_method :variance, :variance_sample
1091
- alias_method :sd, :standard_deviation_sample
1092
- alias_method :ss, :sum_of_squares
1093
- include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
1094
151
  end
1095
152
  end