statsample 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/History.txt +8 -0
  2. data/Manifest.txt +20 -2
  3. data/data/crime.txt +47 -0
  4. data/data/test_binomial.csv +201 -0
  5. data/demo/distribution_t.rb +2 -2
  6. data/demo/regression.rb +2 -1
  7. data/lib/distribution.rb +8 -0
  8. data/lib/distribution/chisquare.rb +24 -0
  9. data/lib/distribution/f.rb +25 -0
  10. data/lib/distribution/normal.rb +25 -0
  11. data/lib/distribution/t.rb +22 -0
  12. data/lib/matrix_extension.rb +78 -0
  13. data/lib/statistics2.rb +531 -0
  14. data/lib/statsample.rb +12 -9
  15. data/lib/statsample/anova.rb +1 -5
  16. data/lib/statsample/bivariate.rb +24 -20
  17. data/lib/statsample/combination.rb +14 -4
  18. data/lib/statsample/converters.rb +17 -1
  19. data/lib/statsample/dataset.rb +66 -10
  20. data/lib/statsample/dominanceanalysis/bootstrap.rb +1 -3
  21. data/lib/statsample/graph/gdchart.rb +2 -3
  22. data/lib/statsample/graph/svggraph.rb +8 -4
  23. data/lib/statsample/mle.rb +137 -0
  24. data/lib/statsample/mle/logit.rb +95 -0
  25. data/lib/statsample/mle/normal.rb +83 -0
  26. data/lib/statsample/mle/probit.rb +93 -0
  27. data/lib/statsample/regression.rb +3 -1
  28. data/lib/statsample/regression/binomial.rb +65 -0
  29. data/lib/statsample/regression/binomial/logit.rb +13 -0
  30. data/lib/statsample/regression/binomial/probit.rb +13 -0
  31. data/lib/statsample/regression/multiple.rb +61 -58
  32. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  33. data/lib/statsample/srs.rb +5 -5
  34. data/lib/statsample/vector.rb +129 -59
  35. data/test/test_anova.rb +0 -5
  36. data/test/test_dataset.rb +13 -1
  37. data/test/test_distribution.rb +57 -0
  38. data/test/test_gsl.rb +22 -0
  39. data/test/test_logit.rb +22 -0
  40. data/test/test_mle.rb +140 -0
  41. data/test/test_r.rb +9 -0
  42. data/test/test_regression.rb +12 -4
  43. data/test/test_srs.rb +0 -4
  44. data/test/test_stata.rb +11 -0
  45. data/test/test_statistics.rb +0 -15
  46. data/test/test_vector.rb +11 -0
  47. metadata +28 -4
  48. data/lib/statsample/chidistribution.rb +0 -39
  49. data/lib/statsample/regression/logit.rb +0 -35
@@ -0,0 +1,13 @@
1
+ module Statsample
2
+ module Regression
3
+ module Binomial
4
+ # Logistic Regression
5
+ class Logit < BaseEngine
6
+ def initialize(ds,y_var)
7
+ model=Statsample::MLE::Logit.new
8
+ super(ds,y_var,model)
9
+ end
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,13 @@
1
+ module Statsample
2
+ module Regression
3
+ module Binomial
4
+ # Probit Regression
5
+ class Probit < BaseEngine
6
+ def initialize(ds,y_var)
7
+ model=Statsample::MLE::Probit.new
8
+ super(ds,y_var,model)
9
+ end
10
+ end
11
+ end
12
+ end
13
+ end
@@ -1,6 +1,6 @@
1
1
  module Statsample
2
2
  module Regression
3
- # Module for Multiple Regression Analysis
3
+ # Module for Linear Multiple Regression Analysis
4
4
  # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise or instance directly the engines
5
5
  # Example.
6
6
  #
@@ -35,7 +35,21 @@ module Multiple
35
35
  def self.pairwise(ds,y_var)
36
36
  RubyEngine.new(ds,y_var)
37
37
  end
38
-
38
+ def self.listwise_by_exp(ds,exp)
39
+ end
40
+ # Returns a dataset and name of criteria using an expression.
41
+ # All nominal vectors are replaced by dummy coding
42
+ # and interactions are calculated
43
+
44
+ def self.ds_by_exp(ds,exp)
45
+ raise "Not implemented"
46
+ parts=exp.split(/[\+=]/)
47
+ dependent=parts.pop
48
+ ds_out=[]
49
+ parts.each{|p|
50
+
51
+ }
52
+ end
39
53
  # Base class for Multiple Regression Engines
40
54
  class BaseEngine
41
55
  def initialize(ds,y_var)
@@ -119,11 +133,7 @@ module Multiple
119
133
  end
120
134
  # Significance of Fisher
121
135
  def significance
122
- if HAS_GSL
123
- GSL::Cdf.fdist_Q(f,df_r,df_e)
124
- else
125
- raise "Need Ruby/GSL"
126
- end
136
+ 1.0-Distribution::F.cdf(f,df_r,df_e)
127
137
  end
128
138
  # Tolerance for a given variable
129
139
  # http://talkstats.com/showthread.php?t=5056
@@ -136,13 +146,13 @@ module Multiple
136
146
  1-lr.r2
137
147
  end
138
148
  # Tolerances for each coefficient
139
- def coeffs_tolerances
140
- @fields.inject({}) {|a,f|
141
- a[f]=tolerance(f);
142
- a
143
- }
144
- end
145
- # Standard Error for coefficients
149
+ def coeffs_tolerances
150
+ @fields.inject({}) {|a,f|
151
+ a[f]=tolerance(f);
152
+ a
153
+ }
154
+ end
155
+ # Standard Error for coefficients
146
156
  def coeffs_se
147
157
  out={}
148
158
  mse=sse.quo(df_e)
@@ -163,7 +173,6 @@ module Multiple
163
173
  x=Matrix.columns(columns)
164
174
  matrix=((x.t*x)).inverse * mse
165
175
  matrix.collect {|i|
166
-
167
176
  Math::sqrt(i) if i>0
168
177
  }
169
178
  end
@@ -177,10 +186,10 @@ module Multiple
177
186
  end
178
187
  # Retrieves a summary for Regression
179
188
  def summary(report_type=ConsoleSummary)
180
- c=coeffs
181
- out=""
182
- out.extend report_type
183
- out.add <<HEREDOC
189
+ c=coeffs
190
+ out=""
191
+ out.extend report_type
192
+ out.add <<HEREDOC
184
193
  Summary for regression of #{@fields.join(',')} over #{@y_var}
185
194
  *************************************************************
186
195
  Engine: #{self.class}
@@ -190,45 +199,39 @@ r2=#{sprintf("%0.3f",r2)}
190
199
  Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
191
200
  HEREDOC
192
201
 
193
- out.add_line
194
- out.add "ANOVA TABLE"
195
-
196
- t=Statsample::ReportTable.new(%w{source ss df ms f s})
197
- begin
198
- t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
199
- rescue RuntimeError
200
- t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), "???", "???"])
201
- end
202
- t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
203
-
204
- t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
205
-
206
- out.parse_table(t)
207
-
208
- begin
209
- out.add "Beta coefficientes"
210
- sc=standarized_coeffs
211
- cse=coeffs_se
212
- t=Statsample::ReportTable.new(%w{coeff beta se t})
213
- t.add_row(["Constant", "-",constant_se, constant_t])
214
- @fields.each{|f|
215
- t.add_row([f, sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
216
- }
217
- out.parse_table(t)
218
-
219
- rescue
220
-
221
- end
222
-
223
- out
224
- end
225
- def assign_names(c)
226
- a={}
227
- @fields.each_index {|i|
228
- a[@fields[i]]=c[i]
229
- }
230
- a
231
- end
202
+ out.add_line
203
+ out.add "ANOVA TABLE"
204
+
205
+ t=Statsample::ReportTable.new(%w{source ss df ms f s})
206
+ t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
207
+ t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
208
+
209
+ t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
210
+
211
+ out.parse_table(t)
212
+
213
+ begin
214
+ out.add "Beta coefficientes"
215
+ sc=standarized_coeffs
216
+ cse=coeffs_se
217
+ t=Statsample::ReportTable.new(%w{coeff b beta se t})
218
+ t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
219
+ @fields.each{|f|
220
+ t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
221
+ }
222
+ out.parse_table(t)
223
+
224
+ rescue
225
+ end
226
+ out
227
+ end
228
+ def assign_names(c)
229
+ a={}
230
+ @fields.each_index {|i|
231
+ a[@fields[i]]=c[i]
232
+ }
233
+ a
234
+ end
232
235
 
233
236
 
234
237
  # Deprecated
@@ -26,7 +26,7 @@ class RubyEngine < BaseEngine
26
26
  obtain_y_vector
27
27
  @matrix_x = Bivariate.correlation_matrix(@ds_indep)
28
28
  @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
29
- @min_n_valid=nil
29
+ @min_n_valid=nil
30
30
  end
31
31
  def min_n_valid
32
32
  if @min_n_valid.nil?
@@ -26,7 +26,7 @@ module Statsample
26
26
  end
27
27
  # Sample size estimation for proportions, infinite poblation
28
28
  def estimation_n0(d,prop,margin=0.95)
29
- t=GSL::Cdf.ugaussian_Pinv(1-(1-margin).quo(2))
29
+ t=Distribution::Normal.p_value(1-(1-margin).quo(2))
30
30
  var=prop*(1-prop)
31
31
  t**2*var.quo(d**2)
32
32
  end
@@ -39,13 +39,13 @@ module Statsample
39
39
  # Uses estimated proportion, sample without replacement.
40
40
 
41
41
  def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
42
- t=GSL::Cdf.tdist_Pinv(1-((1-margin).quo(2)) , n_sample-1)
42
+ t = Distribution::T.p_value(1-((1-margin).quo(2)) , n_sample-1)
43
43
  proportion_confidence_interval(prop,n_sample,n_population, t)
44
44
  end
45
45
  # Proportion confidence interval with z values
46
46
  # Uses estimated proportion, sample without replacement.
47
47
  def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
48
- z=GSL::Cdf.ugaussian_Pinv(1-((1-margin).quo(2)))
48
+ z=Distribution::Normal.p_value(1-((1-margin).quo(2)))
49
49
  proportion_confidence_interval(p,n_sample,n_population, z)
50
50
  end
51
51
  # Proportion confidence interval with x value
@@ -137,13 +137,13 @@ module Statsample
137
137
  # Confidence Interval using T-Student
138
138
  # Use with n < 60
139
139
  def mean_confidence_interval_t(mean,s,n_sample,n_population,margin=0.95)
140
- t=GSL::Cdf.tdist_Pinv(1-((1-margin) / 2),n_sample-1)
140
+ t=Distribution::T.p_value(1-((1-margin) / 2),n_sample-1)
141
141
  mean_confidence_interval(mean,s,n_sample,n_population,t)
142
142
  end
143
143
  # Confidente Interval using Z
144
144
  # Use with n > 60
145
145
  def mean_confidence_interval_z(mean,s,n_sample,n_population,margin=0.95)
146
- z=GSL::Cdf.ugaussian_Pinv(1-((1-margin) / 2))
146
+ z=Distribution::Normal.p_value(1-((1-margin) / 2))
147
147
  mean_confidence_interval(mean,s,n_sample,n_population, z)
148
148
  end
149
149
  # Confidente interval using X.
@@ -42,20 +42,38 @@ module Statsample
42
42
 
43
43
  class Vector
44
44
  include Enumerable
45
- attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils, :gsl
45
+ # Level of measurement. Could be :nominal, :ordinal or :scale
46
+ attr_reader :type
47
+ # Original data.
48
+ attr_reader :data
49
+ # Valid data. Equal to data, minus values assigned as missing values
50
+ attr_reader :valid_data
51
+ # Array of values considered as missing. Nil is a missing value, by default
52
+ attr_reader :missing_values
53
+ # Missing values array
54
+ attr_reader :missing_data
55
+ # Original data, with all missing values replaced by nils
56
+ attr_reader :data_with_nils
57
+ # GSL Object, only available with rbgsl extension and type==:scale
58
+ attr_reader :gsl
59
+ # Change label for specific values
46
60
  attr_accessor :labels
47
- # Creates a new
48
- # data = Array of data
49
- # t = level of meausurement. Could be:
50
- # [:nominal] : Nominal level of measurement
51
- # [:ordinal] : Ordinal level of measurement
52
- # [:scale] : Scale level of meausurement
53
- #
54
- def initialize(data=[],t=:nominal,missing_values=[],labels={})
61
+ # Creates a new Vector object.
62
+ # [data] Array of data.
63
+ # [type] Level of measurement. See Vector#type
64
+ # [missing_values] Array of missing values. See Vector#missing_values
65
+ # [labels] Labels for data values
66
+ #
67
+ # The fast way to create a vector uses Array#to_vector. Remember
68
+ # to include as the first argument the level of measurement
69
+ #
70
+ # v=[1,2,3,4].to_vector(:scale)
71
+ #
72
+ def initialize(data=[], t=:nominal,missing_values=[],labels={})
55
73
  raise "Data should be an array" unless data.is_a? Array
56
- @data=data
57
- @missing_values=missing_values
58
- @labels=labels
74
+ @data=data
75
+ @missing_values=missing_values
76
+ @labels=labels
59
77
  @type=t
60
78
  @valid_data=[]
61
79
  @data_with_nils=[]
@@ -65,6 +83,9 @@ class Vector
65
83
  set_valid_data_intern
66
84
  self.type=t
67
85
  end
86
+ # Creates a duplicate of the Vector.
87
+ # Note: data, missing_values and labels are duplicated, so
88
+ # changes on the original vector don't propagate to copies.
68
89
  def dup
69
90
  Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
70
91
  end
@@ -73,14 +94,17 @@ class Vector
73
94
  def dup_empty
74
95
  Vector.new([],@type,@missing_values.dup,@labels.dup)
75
96
  end
97
+ # Raises an exception if type of vector is inferior to t type
98
+ def check_type(t)
99
+ raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
100
+ end
101
+ private :check_type
102
+
76
103
  # Return a vector usign the standarized values for data
77
104
  # with sd with denominator N
78
105
  def vector_standarized_pop
79
106
  vector_standarized(true)
80
107
  end
81
- def check_type(t)
82
- raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
83
- end
84
108
  # Return a vector usign the standarized values for data
85
109
  # with sd with denominator n-1
86
110
 
@@ -114,48 +138,63 @@ class Vector
114
138
  }.to_vector(:scale)
115
139
  end
116
140
 
117
- # Vector equality
141
+ # Vector equality.
118
142
  # Two vector will be the same if their data, missing values, type, labels are equals
119
143
  def ==(v2)
120
144
  raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
121
145
  @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
122
146
  end
123
147
 
124
- def _dump(i)
148
+ def _dump(i) # :nodoc:
125
149
  Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
126
150
  end
127
- def self._load(data)
151
+
152
+ def self._load(data) # :nodoc:
128
153
  h=Marshal.load(data)
129
154
  Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
130
155
  end
156
+ # Returns a new vector, with data modified by block.
157
+ # Equivalent to create a Vector after #collect on data
131
158
  def recode
132
159
  @data.collect{|x|
133
160
  yield x
134
161
  }.to_vector(@type)
135
162
  end
163
+ # Modifies current vector, with data modified by block.
164
+ # Equivalent to #collect! on @data
165
+
136
166
  def recode!
137
167
  @data.collect!{|x|
138
168
  yield x
139
169
  }
140
170
  set_valid_data
141
171
  end
172
+ # Iterate on each item
173
+ # Equivalent to
174
+ # @data.each{|x| yield x}
142
175
  def each
143
- @data.each{|x|
144
- yield(x)
145
- }
176
+ @data.each{|x| yield(x) }
146
177
  end
178
+
179
+ # Iterate on each item_index
180
+
147
181
  def each_index
148
182
  (0...@data.size).each {|i|
149
183
  yield(i)
150
184
  }
151
185
  end
152
- # Add a value at the end of the vector
153
- # If second argument set to false, you should update valid data usign
186
+ # Add a value at the end of the vector.
187
+ # If second argument set to false, you should update the Vector using
154
188
  # Vector#set_valid_data at the end of your insertion cycle
189
+ #
155
190
  def add(v,update_valid=true)
156
191
  @data.push(v)
157
192
  set_valid_data if update_valid
158
193
  end
194
+ # Update valid_data, missing_data, data_with_nils and gsl
195
+ # at the end of an insertion
196
+ #
197
+ # Use after add(v,false)
159
198
  def set_valid_data
160
199
  @valid_data.clear
161
200
  @missing_data.clear
@@ -186,6 +225,7 @@ class Vector
186
225
  end
187
226
  @has_missing_data=@missing_data.size>0
188
227
  end
228
+
189
229
  # Retrieves true if data has one o more missing values
190
230
  def has_missing_data?
191
231
  @has_missing_data
@@ -193,7 +233,7 @@ class Vector
193
233
  def labeling(x)
194
234
  @labels.has_key?(x) ? @labels[x].to_s : x.to_s
195
235
  end
196
- # Returns a Vector with the data with labels replaced by the label
236
+ # Returns a Vector with the data with labels replaced by the label.
197
237
  def vector_labeled
198
238
  d=@data.collect{|x|
199
239
  if @labels.has_key? x
@@ -204,12 +244,18 @@ class Vector
204
244
  }
205
245
  Vector.new(d,@type)
206
246
  end
247
+ # Size of total data
207
248
  def size
208
249
  @data.size
209
250
  end
251
+ alias_method :n, :size
252
+
253
+ # Retrieves i element of data
210
254
  def [](i)
211
255
  @data[i]
212
256
  end
257
+ # Set i element of data.
258
+ # Note: Use set_valid_data if you include missing values
213
259
  def []=(i,v)
214
260
  @data[i]=v
215
261
  end
@@ -227,7 +273,7 @@ class Vector
227
273
  @type=t
228
274
  set_scale_data if(t==:scale)
229
275
  end
230
- def n; @data.size ; end
276
+
231
277
  def to_a
232
278
  @data.dup
233
279
  end
@@ -292,10 +338,11 @@ class Vector
292
338
  end
293
339
 
294
340
  end
295
- # Return an array with the data splitted by a separator
296
- # a=Vector.new(["a,b","c,d","a,b","d"])
297
- # a.splitted
298
- # [["a","b"],["c","d"],["a","b"],["d"]]
341
+ # Return an array with the data splitted by a separator.
342
+ # a=Vector.new(["a,b","c,d","a,b","d"])
343
+ # a.splitted
344
+ # =>
345
+ # [["a","b"],["c","d"],["a","b"],["d"]]
299
346
  def splitted(sep=Statsample::SPLIT_TOKEN)
300
347
  @data.collect{|x|
301
348
  if x.nil?
@@ -311,11 +358,14 @@ class Vector
311
358
  # defined on the fields
312
359
  # Example:
313
360
  #
314
- # a=Vector.new(["a,b","c,d","a,b"])
361
+ # a=Vector.new(["a,b","c,d","a,b"])
315
362
  # a.split_by_separator
316
- # {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88 @data=[1, 0, 1]>,
317
- # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48 @data=[1, 1, 0]>,
318
- # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08 @data=[0, 1, 1]>}
363
+ # => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
364
+ # @data=[1, 0, 1]>,
365
+ # "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
366
+ # @data=[1, 1, 0]>,
367
+ # "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
368
+ # @data=[0, 1, 1]>}
319
369
  #
320
370
  def split_by_separator(sep=Statsample::SPLIT_TOKEN)
321
371
  split_data=splitted(sep)
@@ -353,7 +403,7 @@ class Vector
353
403
  # In all the trails, every item have the same probability
354
404
  # of been selected
355
405
  def sample_with_replacement(sample=1)
356
- if(@type!=:scale)
406
+ if(@type!=:scale or !HAS_GSL)
357
407
  vds=@valid_data.size
358
408
  (0...sample).collect{ @valid_data[rand(vds)] }
359
409
  else
@@ -368,7 +418,7 @@ class Vector
368
418
  # A sample of the same size of the vector is the vector itself
369
419
 
370
420
  def sample_without_replacement(sample=1)
371
- if(@type!=:scale)
421
+ if(@type!=:scale or !HAS_GSL)
372
422
  raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
373
423
  out=[]
374
424
  size=@valid_data.size
@@ -393,7 +443,8 @@ class Vector
393
443
  frequencies[x].nil? ? 0 : frequencies[x]
394
444
  end
395
445
  end
396
- # returns the real type for the vector, according to its content
446
+ # returns the database type for the vector, according to its content
447
+
397
448
  def db_type(dbs='mysql')
398
449
  # first, detect any character not number
399
450
  if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
@@ -416,15 +467,28 @@ class Vector
416
467
  end
417
468
  def to_s
418
469
  sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
470
+ end
471
+ # Ugly name. Really, create a Vector for standard 'matrix' package.
472
+ # <tt>dir</tt> could be :horizontal or :vertical
473
+ def to_matrix(dir=:horizontal)
474
+ case dir
475
+ when :horizontal
476
+ Matrix[@data]
477
+ when :vertical
478
+ Matrix.columns([@data])
479
+ end
419
480
  end
420
481
  def inspect
421
482
  self.to_s
422
483
  end
484
+ def as_r
485
+ @data.dup
486
+ end
423
487
  def factors
424
488
  if @type==:scale
425
489
  @scale_data.uniq.sort
426
490
  else
427
- @valid_data.uniq.sort
491
+ @valid_data.uniq.sort
428
492
  end
429
493
  end
430
494
  if Statsample::STATSAMPLE__.respond_to?(:frequencies)
@@ -472,16 +536,16 @@ class Vector
472
536
  end
473
537
 
474
538
 
475
- # Returns the most frequent item
539
+ # Returns the most frequent item.
476
540
  def mode
477
541
  frequencies.max{|a,b| a[1]<=>b[1]}[0]
478
542
  end
479
- # The numbers of item with valid data
543
+ # The numbers of item with valid data.
480
544
  def n_valid
481
545
  @valid_data.size
482
546
  end
483
547
  # Returns a hash with the distribution of proportions of
484
- # the sample
548
+ # the sample.
485
549
  def proportions
486
550
  frequencies.inject({}){|a,v|
487
551
  a[v[0]] = v[1].quo(n_valid)
@@ -512,13 +576,11 @@ class Vector
512
576
  out
513
577
  end
514
578
 
515
-
516
-
517
-
518
579
  # Variance of p, according to poblation size
519
580
  def variance_proportion(n_poblation, v=1)
520
581
  Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
521
582
  end
583
+ # Variance of the total, according to population size
522
584
  def variance_total(n_poblation, v=1)
523
585
  Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
524
586
  end
@@ -534,7 +596,10 @@ class Vector
534
596
  alias_method met_or, met
535
597
  end
536
598
  }
537
- # Ordinal Methods
599
+ ######
600
+ ### Ordinal Methods
601
+ ######
602
+
538
603
  # Return the value of the percentil q
539
604
  def percentil(q)
540
605
  check_type :ordinal
@@ -546,7 +611,7 @@ class Vector
546
611
  (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
547
612
  end
548
613
  end
549
- # Returns a ranked vector
614
+ # Returns a ranked vector.
550
615
  def ranked(type=:ordinal)
551
616
  check_type :ordinal
552
617
  i=0
@@ -593,6 +658,8 @@ class Vector
593
658
  @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
594
659
  end
595
660
  end
661
+ private :set_scale_data
662
+
596
663
  # The range of the data (max - min)
597
664
  def range;
598
665
  check_type :scale
@@ -608,9 +675,12 @@ class Vector
608
675
 
609
676
  sum.to_f.quo(n_valid)
610
677
  end
678
+ # Sum of squares for the data around a value.
679
+ # By default, this value is the mean
680
+ # ss= sum{(xi-m)^2}
681
+ #
611
682
  def sum_of_squares(m=nil)
612
683
  check_type :scale
613
-
614
684
  m||=mean
615
685
  @scale_data.inject(0){|a,x| a+(x-m).square}
616
686
  end
@@ -618,27 +688,25 @@ class Vector
618
688
  # Sum of squared deviation
619
689
  def sum_of_squared_deviation
620
690
  check_type :scale
621
-
622
691
  @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
623
692
  end
624
693
 
625
- # Population variance (divided by n)
694
+ # Population variance (denominator N)
626
695
  def variance_population(m=nil)
627
696
  check_type :scale
628
-
629
697
  m||=mean
630
698
  squares=@scale_data.inject(0){|a,x| x.square+a}
631
699
  squares.quo(n_valid) - m.square
632
700
  end
633
701
 
634
702
 
635
- # Population Standard deviation (divided by n)
703
+ # Population Standard deviation (denominator N)
636
704
  def standard_deviation_population(m=nil)
637
705
  check_type :scale
638
706
 
639
707
  Math::sqrt( variance_population(m) )
640
708
  end
641
- # Sample Variance (divided by n-1)
709
+ # Sample Variance (denominator n-1)
642
710
 
643
711
  def variance_sample(m=nil)
644
712
  check_type :scale
@@ -647,7 +715,7 @@ class Vector
647
715
  sum_of_squares(m).quo(n_valid - 1)
648
716
  end
649
717
 
650
- # Sample Standard deviation (divided by n-1)
718
+ # Sample Standard deviation (denominator n-1)
651
719
 
652
720
  def standard_deviation_sample(m=nil)
653
721
  check_type :scale
@@ -655,13 +723,14 @@ class Vector
655
723
  m||=m
656
724
  Math::sqrt(variance_sample(m))
657
725
  end
726
+ # Skewness of the sample
658
727
  def skew
659
728
  check_type :scale
660
-
661
729
  m=mean
662
730
  thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
663
731
  thirds.quo((@scale_data.size-1)*sd**3)
664
732
  end
733
+ # Kurtosis of the sample
665
734
  def kurtosis
666
735
  check_type :scale
667
736
 
@@ -670,9 +739,10 @@ class Vector
670
739
  thirds.quo((@scale_data.size-1)*sd**4)
671
740
 
672
741
  end
742
+ # Product of all values on the sample
743
+ #
673
744
  def product
674
745
  check_type :scale
675
-
676
746
  @scale_data.inject(1){|a,x| a*x }
677
747
  end
678
748
  if HAS_GSL
@@ -712,11 +782,11 @@ class Vector
712
782
  m||=mean
713
783
  @gsl.sd_with_fixed_mean(m)
714
784
  end
715
- def skew
785
+ def skew # :nodoc:
716
786
  check_type :scale
717
787
  @gsl.skew
718
788
  end
719
- def kurtosis
789
+ def kurtosis # :nodoc:
720
790
  check_type :scale
721
791
  @gsl.kurtosis
722
792
  end
@@ -752,8 +822,8 @@ class Vector
752
822
  alias_method :sdp, :standard_deviation_population
753
823
  alias_method :sds, :standard_deviation_sample
754
824
  alias_method :cov, :coefficient_of_variation
755
- alias_method :variance, :variance_sample
756
- alias_method :sd, :standard_deviation_sample
757
- alias_method :ss, :sum_of_squares
825
+ alias_method :variance, :variance_sample
826
+ alias_method :sd, :standard_deviation_sample
827
+ alias_method :ss, :sum_of_squares
758
828
  end
759
829
  end